# entity_linking.py

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pickle
from collections import Counter

import editdistance
import numpy as np
import torch

# Module-level model setup. NOTE: everything below runs at import time.
# Fetch the 'roberta.large.mnli' entry point from the 'pytorch/fairseq' hub repository.
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
# Move the model to the GPU; importing this module therefore requires CUDA.
roberta.cuda()
# Switch to evaluation mode (inference only).
roberta.eval()
# Class index -> label name. The distance helpers in this file read index 2
# ('entail') and index 0 ('contradiction') of the probabilities from get_probs.
roberta.mapping = {0: 'contradiction', 1: 'neutral', 2: 'entail'}
# Softmax over dimension 1, applied by get_probs to the model's prediction output.
softmax_convert = torch.nn.Softmax(dim=1)


def bi_jaccard_similarity(a, b):
    """Return the multiset Jaccard similarity of the character bigrams of two strings.

    Both strings are lower-cased and wrapped in a start marker ('§') and an
    end marker ('±') before their bigrams are collected, so the first and
    last characters also contribute boundary bigrams.

    Args:
        a: First string.
        b: Second string.

    Returns:
        1.0 when the strings are equal ignoring case; otherwise the size of
        the bigram multiset intersection divided by the size of the bigram
        multiset union.
    """
    left = ''.join(('§', a.lower(), '±'))
    right = ''.join(('§', b.lower(), '±'))

    # Identical after normalisation: nothing to count.
    if left == right:
        return 1.0

    def bigram_counts(text):
        # Multiset of all adjacent character pairs in `text`.
        return Counter(text[pos:pos + 2] for pos in range(len(text) - 1))

    left_counts = bigram_counts(left)
    right_counts = bigram_counts(right)

    # Counter `&` keeps the minimum count per bigram, `|` the maximum.
    shared = sum((left_counts & right_counts).values())
    combined = sum((left_counts | right_counts).values())
    return shared / combined


def bi_jaccard_distance(a, b):
    """Return the bigram Jaccard distance, i.e. 1 minus bi_jaccard_similarity(a, b)."""
    similarity = bi_jaccard_similarity(a, b)
    return 1 - similarity


def leven_distance(a, b):
    """Return the case-insensitive edit distance between two strings.

    Both strings are lower-cased before being passed to editdistance.eval.
    """
    lowered_a, lowered_b = a.lower(), b.lower()
    return editdistance.eval(lowered_a, lowered_b)


def leven_distance_norm(a, b):
    """Return the case-insensitive edit distance normalised by the length of `a`.

    Args:
        a: Query string; its length is the normalisation factor.
        b: Candidate string.

    Returns:
        editdistance.eval of the lower-cased strings divided by len(a). When
        `a` is empty the divisor is clamped to 1 (so the raw edit distance is
        returned) instead of raising ZeroDivisionError.
    """
    raw_distance = editdistance.eval(a.lower(), b.lower())
    # Clamp the divisor so an empty query string does not divide by zero.
    return raw_distance / max(len(a), 1)


def get_closest(a, the_list, distance_func):
    """Return the element of `the_list` nearest to `a` together with its distance.

    Args:
        a: Query value.
        the_list: Indexable sequence of candidates.
        distance_func: Callable (a, candidate) -> comparable distance.

    Returns:
        A (candidate, distance) tuple for the smallest distance; on ties the
        earliest candidate wins.

    Raises:
        ValueError: If `the_list` is empty (raised by min()).
    """
    distances = [distance_func(a, candidate) for candidate in the_list]
    smallest = min(distances)
    # list.index returns the first occurrence, so ties go to the earliest candidate.
    winner = the_list[distances.index(smallest)]
    return winner, smallest


def entity_linking(v, distinct_slot_values, method_name, threshold=None, slot_name=None):
    """Link a raw slot value to one of the known values of its slot.

    Args:
        v: Raw value (a string) to link.
        distinct_slot_values: Candidate strings to link against.
        method_name: 'exact' or 'exact_nocase' for direct matching, or one of
            the distance-based methods 'bijaccard', 'edit_distance',
            'edit_distance_norm', 'roberta_mnli', 'average_three'.
        threshold: Optional maximum distance for the distance-based methods.
            When given, the closest candidate is returned only if its distance
            does not exceed the threshold.
        slot_name: Not used by this function; kept for call compatibility.

    Returns:
        The linked candidate, or None when nothing matches, when the closest
        candidate is farther than `threshold`, or when a distance-based
        method is given no candidates.

    Raises:
        ValueError: If `method_name` is not a supported method.
    """
    if method_name == 'exact':
        return v if v in distinct_slot_values else None

    if method_name == 'exact_nocase':
        for e in distinct_slot_values:
            if e.lower() == v.lower():
                return e
        return None

    distance_methods = ('bijaccard', 'edit_distance', 'edit_distance_norm',
                        'roberta_mnli', 'average_three')
    if method_name not in distance_methods:
        raise ValueError(f'Invalid entity linking method name: {method_name}')

    # With no candidates there is nothing to link to; without this guard
    # get_closest would fail on min() of an empty sequence.
    if len(distinct_slot_values) == 0:
        return None

    if method_name == 'bijaccard':
        distance = bi_jaccard_distance
    elif method_name == 'edit_distance':
        distance = leven_distance  # Case-insensitive
    elif method_name == 'edit_distance_norm':
        distance = leven_distance_norm  # Case-insensitive
    elif method_name == 'roberta_mnli':
        distance = roberta_mnli_distance
    else:  # 'average_three'
        distance = average_three_distance

    candidate, score = get_closest(v, distinct_slot_values, distance)
    if threshold is not None and score > threshold:
        return None
    return candidate


def get_probs(roberta, pair):
    """Return the class probabilities the MNLI model assigns to a sentence pair.

    Results are memoised in the module-level `memorise_roberta` dict, keyed by
    `pair`, so each distinct pair is run through the model at most once.

    Args:
        roberta: Model object providing `encode(*pair)` and
            `predict('mnli', tokens)`.
        pair: Hashable tuple of strings, unpacked into `roberta.encode`.

    Returns:
        A list of Python floats: the softmax over the first row of the
        prediction output. The same list object is returned on cache hits.
    """
    if pair not in memorise_roberta:
        tokens = roberta.encode(*pair)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            row = softmax_convert(roberta.predict('mnli', tokens))[0]
        # tolist() copies the values to host memory as plain Python floats.
        memorise_roberta[pair] = row.tolist()
    return memorise_roberta[pair]


def roberta_mnli_distance(text, entity):
    """Return 1 minus the probability at index 2 ('entail' in roberta.mapping) for (text, entity)."""
    entail_prob = get_probs(roberta, (text, entity))[2]
    return 1 - entail_prob


def roberta_mnli_ne_distance(text, entity):
    """Return the probability at index 0 ('contradiction' in roberta.mapping) for (text, entity)."""
    contradiction_prob = get_probs(roberta, (text, entity))[0]
    return contradiction_prob


def roberta_mnli_maxne_distance(text, entity):
    """Return 1 minus the larger of the 'neutral' (index 1) and 'entail' (index 2) probabilities."""
    class_probs = get_probs(roberta, (text, entity))
    neutral_prob, entail_prob = class_probs[1], class_probs[2]
    return 1 - max(neutral_prob, entail_prob)


def average_three_distance(text, entity):
    """Return the mean of three distances between `text` and `entity`.

    The three components are the bigram Jaccard distance, the edit distance
    normalised by len(text), and the RoBERTa-MNLI distance.

    Args:
        text: Query string.
        entity: Candidate string.

    Returns:
        The arithmetic mean of the three distances. When `text` is empty the
        edit-distance divisor is clamped to 1 instead of raising
        ZeroDivisionError.
    """
    jaccard_part = bi_jaccard_distance(text, entity)
    # Clamp the divisor so an empty query string does not divide by zero.
    edit_part = leven_distance(text, entity) / max(len(text), 1)
    mnli_part = roberta_mnli_distance(text, entity)
    return (jaccard_part + edit_part + mnli_part) / 3


def entity_linking_list(slot_values, distinct_slot_values, method, threshold=None):
    """Apply entity_linking to every value of every slot that has known candidates.

    Args:
        slot_values: Mapping slot name -> list of raw values.
        distinct_slot_values: Mapping slot name -> candidate values.
        method: Linking method name forwarded to entity_linking.
        threshold: Optional distance threshold forwarded to entity_linking.

    Returns:
        A new dict with the same keys as `slot_values`. Slots present in
        `distinct_slot_values` map to the list of linked values; all other
        slots map to their original list object, unchanged.
    """
    def link_slot(slot, raw_values):
        # Slots without a candidate inventory are passed through untouched.
        if slot not in distinct_slot_values:
            return raw_values
        return [entity_linking(raw, distinct_slot_values[slot], method, threshold, slot)
                for raw in raw_values]

    return {slot: link_slot(slot, raw_values) for slot, raw_values in slot_values.items()}


def get_ranking(a, the_list, distance_func):
    """Return all candidates paired with their distance to `a`, closest first.

    Args:
        a: Query value.
        the_list: Iterable of candidates.
        distance_func: Callable (a, candidate) -> comparable distance.

    Returns:
        A list of (candidate, distance) tuples in ascending distance order.
        The sort is stable, so equally distant candidates keep their input order.
    """
    scored = ((candidate, distance_func(a, candidate)) for candidate in the_list)
    return sorted(scored, key=lambda pair: pair[1])


def entity_sorting(v, distinct_slot_values, method_name, threshold=None, slot_name=None):
    """Rank the known slot values (plus the None option) for a raw value.

    Args:
        v: Raw value (a string) to link.
        distinct_slot_values: Candidate strings to rank.
        method_name: 'exact' or 'exact_nocase' for direct matching, or one of
            the distance-based methods 'bijaccard', 'edit_distance',
            'edit_distance_norm', 'roberta_mnli', 'average_three' (the same
            set that entity_linking accepts).
        threshold: Optional distance threshold that positions None in the
            ranking for the distance-based methods.
        slot_name: Not used by this function; kept for call compatibility.

    Returns:
        A list of candidates, best first, containing None exactly once.
        For the exact methods: [match, None] on a hit, [None] otherwise.
        For distance-based methods: all candidates by ascending distance, with
        None placed just before the first candidate whose distance exceeds
        `threshold`, or last when there is no threshold or no candidate
        exceeds it.

    Raises:
        ValueError: If `method_name` is not a supported method.
    """
    if method_name == 'exact':
        return [v, None] if v in distinct_slot_values else [None]

    if method_name == 'exact_nocase':
        for e in distinct_slot_values:
            if e.lower() == v.lower():
                return [e, None]
        return [None]

    if method_name == 'bijaccard':
        distance = bi_jaccard_distance
    elif method_name == 'edit_distance':
        distance = leven_distance  # Case-insensitive
    elif method_name == 'edit_distance_norm':
        distance = leven_distance_norm  # Case-insensitive
    elif method_name == 'roberta_mnli':
        distance = roberta_mnli_distance
    elif method_name == 'average_three':
        distance = average_three_distance
    else:
        raise ValueError(f'Invalid entity linking method name: {method_name}')

    ranked = get_ranking(v, distinct_slot_values, distance)
    entities = [entity for entity, _ in ranked]
    if threshold is None:
        return entities + [None]

    # Find the first candidate beyond the threshold; None goes right before
    # it. If every candidate is within the threshold, None goes last so the
    # ranking always contains it.
    cut = len(ranked)
    for idx, (_, score) in enumerate(ranked):
        if not score <= threshold:
            cut = idx
            break
    return entities[:cut] + [None] + entities[cut:]


def _ratio_or_none(numerator, denominator):
    """Return numerator / denominator, or None when the denominator is zero."""
    if denominator == 0:
        return None
    return numerator / denominator


def calculate_prf_one_group_entity(all_gts, all_pds, raw_pds, distinct_slot_values):
    """Compute linking metrics for one group of ground-truth and predicted tuples.

    Args:
        all_gts: Set of ground-truth tuples; element 0 and element 1 identify
            the slot occurrence and the last element is the linked value
            (None meaning "no entity").
        all_pds: Set of predicted tuples with the same layout as `all_gts`.
        raw_pds: Set of tuples whose elements from index 2 onwards are
            (raw value, method name, threshold, slot name); used to re-rank
            the candidates for the top-n accuracies.
        distinct_slot_values: Mapping slot name -> candidate values.

    Returns:
        A dict with keys 'none_recall', 'link_accuracy',
        'link_accuracy_at_2', 'link_accuracy_at_3', 'link_accuracy_at_5',
        'link_accuracy_at_10', 'precision', 'recall' and 'f1'. A metric whose
        denominator is zero is reported as None.
    """
    top_ns = (2, 3, 5, 10)

    common = all_gts.intersection(all_pds)
    precision = _ratio_or_none(len(common), len(all_pds))
    recall = _ratio_or_none(len(common), len(all_gts))
    # F1 is undefined when either component is undefined or both are zero.
    if precision is None or recall is None or precision + recall == 0:
        f1 = None
    else:
        f1 = 2 * precision * recall / (precision + recall)

    # Recall restricted to ground truths whose linked value is None.
    none_gts = {t for t in all_gts if t[-1] is None}
    none_recall = _ratio_or_none(len(none_gts.intersection(all_pds)), len(none_gts))

    # Index by "<element 0>-<element 1>". Tuples sharing that key overwrite
    # each other, so only one value per key is kept.
    all_gts_dict = {f'{t[0]}-{t[1]}': t[-1] for t in all_gts}
    all_pds_dict = {f'{t[0]}-{t[1]}': t[-1] for t in all_pds}
    raw_pds_dict = {f'{t[0]}-{t[1]}': list(t)[2:] for t in raw_pds}
    common_slots = set(all_gts_dict.keys()).intersection(set(all_pds_dict.keys()))

    correct = sum(1 for s in common_slots if all_gts_dict[s] == all_pds_dict[s])
    accuracy = _ratio_or_none(correct, len(common_slots))

    if len(common_slots) == 0:
        link_accuracy_at = {n: None for n in top_ns}
    else:
        count_at = {n: 0 for n in top_ns}
        for s in common_slots:
            raw = raw_pds_dict[s]
            candidates = distinct_slot_values[raw[-1]]
            sorted_entities = entity_sorting(raw[0], candidates, raw[1],
                                             threshold=raw[2], slot_name=raw[-1])
            for n in top_ns:
                # When n covers every candidate plus the None option, the
                # ground truth counts as found without inspecting the ranking.
                if n >= len(candidates) + 1 or all_gts_dict[s] in sorted_entities[:n]:
                    count_at[n] += 1
        link_accuracy_at = {n: count_at[n] / len(common_slots) for n in top_ns}

    return {'none_recall': none_recall,
            'link_accuracy': accuracy,
            'link_accuracy_at_2': link_accuracy_at[2],
            'link_accuracy_at_3': link_accuracy_at[3],
            'link_accuracy_at_5': link_accuracy_at[5],
            'link_accuracy_at_10': link_accuracy_at[10],
            'precision': precision,
            'recall': recall,
            'f1': f1}


def calculate_prf_entity(all_gts, all_pds, raw_pds, distinct_slot_values):
    """Compute linking metrics overall and separately for every slot type.

    Args:
        all_gts: Set of ground-truth tuples; element 1 is the slot type.
        all_pds: Set of predicted tuples; element 1 is the slot type.
        raw_pds: Set of raw prediction tuples; element 1 is the slot type.
        distinct_slot_values: Mapping slot type -> candidate values; its keys
            determine which per-type breakdowns are produced.

    Returns:
        The metrics dict of calculate_prf_one_group_entity for the full sets,
        extended with a 'type_stats' entry mapping each slot type to the
        metrics computed on the tuples of that type only.
    """
    def of_type(stype, tuples):
        # Keep only the tuples whose slot type (element 1) is `stype`.
        return {t for t in tuples if t[1] == stype}

    report = calculate_prf_one_group_entity(all_gts, all_pds, raw_pds, distinct_slot_values)
    report['type_stats'] = {
        stype: calculate_prf_one_group_entity(of_type(stype, all_gts),
                                              of_type(stype, all_pds),
                                              of_type(stype, raw_pds),
                                              distinct_slot_values)
        for stype in distinct_slot_values
    }
    return report


def get_linking_results_of_method(true_convers, predict_convers, distinct_slot_values, method_name, threshold=None):
    """Collect slot tuples from aligned conversations and compute linking metrics.

    Only utterances whose ground-truth and predicted intents (split on
    '<div>') share at least one intent contribute to the metrics.

    Args:
        true_convers: Mapping conversation key -> sequence of ground-truth
            utterance dicts with 'utteranceId', 'intent' and 'entity_values'.
        predict_convers: Mapping with the same keys -> sequence of predicted
            utterance dicts with 'utteranceId', 'intents', 'entity_values'
            and 'slot_values'.
        distinct_slot_values: Mapping slot name -> candidate values; slots
            not present here are ignored.
        method_name: Linking method recorded in each raw prediction tuple.
        threshold: Threshold recorded in each raw prediction tuple.

    Returns:
        The metrics dict produced by calculate_prf_entity.

    Raises:
        AssertionError: If the two inputs are not aligned (different sizes,
            missing key, or mismatching utterance ids).
    """
    assert len(true_convers) == len(predict_convers)
    all_gts, all_pds, raw_pds = set(), set(), set()

    for key in true_convers:
        assert key in predict_convers
        true_turns, predicted_turns = true_convers[key], predict_convers[key]
        assert len(true_turns) == len(predicted_turns)

        for truth, guess in zip(true_turns, predicted_turns):
            assert truth['utteranceId'] == guess['utteranceId'], f"{truth['utteranceId']} -- {guess['utteranceId']}"
            true_intents = set(truth['intent'].split('<div>'))
            guessed_intents = set(guess['intents'].split('<div>'))

            true_slots = {(truth['utteranceId'], slot, value)
                          for slot, values in truth['entity_values'].items()
                          for value in values if slot in distinct_slot_values}
            guessed_slots = {(guess['utteranceId'], slot, value)
                             for slot, values in guess['entity_values'].items()
                             for value in values if slot in distinct_slot_values}
            # Raw tuples also carry what is needed to re-rank the candidates
            # later: the method, the threshold and the slot name again.
            guessed_raw = {(guess['utteranceId'], slot, value, method_name, threshold, slot)
                           for slot, values in guess['slot_values'].items()
                           for value in values if slot in distinct_slot_values}

            # Skip utterances with no intent in common.
            if not true_intents.intersection(guessed_intents):
                continue
            all_gts |= true_slots
            all_pds |= guessed_slots
            raw_pds |= guessed_raw

    return calculate_prf_entity(all_gts, all_pds, raw_pds, distinct_slot_values)

def entity_linking_list_probs(slot_values, distinct_slot_values, softmaxtemp, method, threshold=None):
    """Apply entity_linking_probs to every value of every slot that has known candidates.

    Args:
        slot_values: Mapping slot name -> list of raw values.
        distinct_slot_values: Mapping slot name -> candidate values.
        softmaxtemp: Softmax temperature forwarded to entity_linking_probs.
        method: Linking method name forwarded to entity_linking_probs.
        threshold: Optional threshold forwarded to entity_linking_probs.

    Returns:
        A new dict with the same keys as `slot_values`. Slots present in
        `distinct_slot_values` map to a list with one entity_linking_probs
        result per raw value; all other slots map to their original list
        object, unchanged.
    """
    def score_slot(slot, raw_values):
        # Slots without a candidate inventory are passed through untouched.
        if slot not in distinct_slot_values:
            return raw_values
        return [entity_linking_probs(raw, distinct_slot_values[slot], softmaxtemp, method, threshold)
                for raw in raw_values]

    return {slot: score_slot(slot, raw_values) for slot, raw_values in slot_values.items()}

def softmax(x, T=1):
    """Return the temperature-scaled softmax of the scores in `x`.

    Args:
        x: Array-like of scores.
        T: Temperature; every score is divided by it before exponentiation.

    Returns:
        A numpy array of exp(x / T), normalised by its sum along axis 0.
    """
    scaled = np.array(x) / T
    # Subtract the maximum before exponentiating to avoid overflow.
    exponentials = np.exp(scaled - np.max(scaled))
    return exponentials / exponentials.sum(axis=0)


def get_scores(a, the_list, distance_func, none_threshold, softmaxtemp):
    """Return softmax probabilities over the candidates plus a final "none" option.

    Args:
        a: Query value.
        the_list: Iterable of candidates.
        distance_func: Callable (a, candidate) -> distance.
        none_threshold: Distance assigned to the "none" option, or None to
            give that option a raw score of 0.
        softmaxtemp: Temperature passed to softmax.

    Returns:
        The softmax of the similarity scores (1 - distance) of all candidates,
        followed by the score of the "none" option as the last entry.
    """
    similarities = [1 - distance_func(a, candidate) for candidate in the_list]
    none_score = 0 if none_threshold is None else 1 - none_threshold
    similarities.append(none_score)
    return softmax(similarities, T=softmaxtemp)


def entity_linking_probs(v, distinct_slot_values, softmaxtemp, method_name, threshold=None):
    if method_name == 'bijaccard':
        distance = bi_jaccard_distance
    elif method_name == 'edit_distance':
        distance = leven_distance  # Case in-sensitive
    elif method_name == 'edit_distance_norm':
        distance = leven_distance_norm  # Case in-sensitive
    elif method_name == 'roberta_mnli':
        distance = roberta_mnli_distance
    elif method_name == 'average_three':
        distance = average_three_distance
    else:
        raise Exception(f'Invalid entity linking method name: {method_name}')

    probs = get_scores(v, distinct_slot_values, distance, threshold, softmaxtemp)
    return probs

# Cache of model probabilities keyed by input pair, read and filled by get_probs.
# It is seeded from 'memorise_roberta.pickle' in the current working directory
# when that file exists; otherwise it starts empty.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load a cache file produced by a trusted run.
if os.path.exists('memorise_roberta.pickle'):
    # Use a context manager so the file handle is closed after loading.
    with open('memorise_roberta.pickle', 'rb') as _cache_file:
        memorise_roberta = pickle.load(_cache_file)
else:
    memorise_roberta = dict()