# utils_token_level_task.py

# coding: utf-8
# Copyright 2019 Sinovation Ventures AI Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for token level classification task."""

from __future__ import absolute_import, division, print_function

import logging
import os
import math
from random import shuffle
logger = logging.getLogger(__name__)


class InputExample(object):
    """Container for one raw (not yet tokenized) training/dev/test example."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example unchanged.

        Args:
            guid: Unique id for the example.
            text_a: Text of the first sequence. The processors in this file
                pass the sentence's tokens joined by single spaces.
            text_b: (Optional) Text of a second sequence; the processors in
                this file always pass ``None``.
            label: (Optional) Label data for the example. The processors in
                this file pass a list with one label per token of ``text_a``.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label


class InputFeatures(object):
    """Plain holder for the model inputs computed for one example.

    Every constructor argument is stored unchanged under an attribute of the
    same name; no validation or copying is performed.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id, ngram_ids, ngram_positions, ngram_lengths,
                 ngram_tuples, ngram_seg_ids, ngram_masks, valid_ids=None, label_mask=None):
        # Token-level inputs and their label information.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        self.valid_ids, self.label_mask = valid_ids, label_mask

        # N-gram inputs.
        self.ngram_ids, self.ngram_positions = ngram_ids, ngram_positions
        self.ngram_lengths, self.ngram_tuples = ngram_lengths, ngram_tuples
        self.ngram_seg_ids, self.ngram_masks = ngram_seg_ids, ngram_masks

class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a whitespace-separated, one-token-per-line file into sentences.

        Each non-blank line contributes its first whitespace-separated field
        as the token and its last field as the label. Blank (or
        whitespace-only) lines and lines starting with ``-DOCSTART`` end the
        current sentence.

        Args:
            input_file: Path of the UTF-8 encoded file to read.
            quotechar: Unused; kept so existing callers keep working.

        Returns:
            A list of ``(tokens, labels)`` tuples, one per sentence, e.g.
            ``[(['EU', 'rejects', 'German'], ['B-ORG', 'O', 'B-MISC'])]``.
        """
        data = []
        sentence = []
        label = []
        # The context manager closes the file even if a line fails to parse.
        with open(input_file, encoding="utf-8") as f:
            for line in f:
                # `not line.strip()` also covers separator lines that contain
                # stray spaces/tabs, which would otherwise yield no fields.
                if not line.strip() or line.startswith('-DOCSTART'):
                    if sentence:
                        data.append((sentence, label))
                        sentence = []
                        label = []
                    continue
                splits = line.split()
                sentence.append(splits[0])
                label.append(splits[-1])

        # Flush the last sentence when the file does not end with a blank line.
        if sentence:
            data.append((sentence, label))
        return data


class PosProcessor(DataProcessor):
    """Processor for the cws POS CTB5 data set.

    Reads ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` from the data directory.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Same as the train/dev loaders, for ``test.tsv``."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the POS tags followed by the "[CLS]" and "[SEP]" markers.

        The list order is significant to callers that derive label ids from
        positions in this list.
        """
        # NOTE: earlier revisions kept alternative label inventories here as
        # commented-out code (a five-tag subset, a differently ordered flat
        # tag list, and a B-/I-/E-/S-prefixed tag list).
        pos_tags = [
            'NR', 'NN', 'CC', 'VV', 'NT', 'PU', 'LC', 'AS', 'ETC', 'DEC',
            'CD', 'M', 'DEG', 'JJ', 'VC', 'AD', 'P', 'PN', 'VA', 'DEV',
            'DT', 'SB', 'OD', 'VE', 'CS', 'MSP', 'BA', 'FW', 'LB', 'NP',
            'DER', 'SP', 'IJ', 'X', 'VP',
        ]
        return pos_tags + ["[CLS]", "[SEP]"]

    def _create_examples(self, lines, set_type):
        """Wrap ``(tokens, labels)`` pairs into `InputExample`s.

        The guid is ``"<set_type>-<index>"`` and ``text_a`` is the tokens
        joined by single spaces; the label list is passed through as-is.
        """
        return [
            InputExample(guid="%s-%s" % (set_type, idx), text_a=' '.join(words), text_b=None, label=tags)
            for idx, (words, tags) in enumerate(lines)
        ]


class GeniaProcessor(DataProcessor):
    """Processor for the Genia data set.

    Reads ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` from the data directory.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Same as the train/dev loaders, for ``test.tsv``."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the Genia tag list followed by "[CLS]" and "[SEP]".

        The list order is significant to callers that derive label ids from
        positions in this list, so it must not be re-sorted.
        """
        genia_tags = [
            'B-G#protein_domain_or_region', 'I-G#cell_component', 'I-G#atom',
            'B-G#other_artificial_source', 'I-G#DNA_family_or_group', 'I-G#protein_molecule',
            'promoter', 'I-G#nucleotide', 'B-G#DNA_domain_or_region',
            'B-G#amino_acid_monomer', 'B-G#DNA_substructure', 'I-G#polynucleotide',
            'B-G#protein_molecule', 'B-G#other_organic_compound', 'I-G#tissue',
            'B-G#mono_cell', 'I-G#RNA_N/A', 'B-G#inorganic',
            'I-G#protein_domain_or_region', 'B-G#nucleotide', 'I-G#inorganic',
            'I-G#DNA_substructure', 'B-G#DNA_molecule', 'I-G#DNA_molecule',
            'I-G#protein_substructure', 'B-G#other_name', 'I-G#other_organic_compound',
            'I-G#RNA_domain_or_region', 'I-G#RNA_molecule', 'B-G#RNA_family_or_group',
            'I-G#cell_line', 'B-G#polynucleotide', 'I-G#peptide',
            'B-G#virus', 'I-G#cell_type', 'B-G#atom',
            'B-G#DNA_N/A', 'I-G#carbohydrate', 'I-G#protein_complex',
            'B-G#cell_type', 'I-G#DNA_domain_or_region', 'B-G#cell_component',
            'B-G#protein_family_or_group', 'I-G#multi_cell', 'I-G#body_part',
            'B-G#cell_line', 'I-G#lipid', 'I-G#other_artificial_source',
            'B-G#RNA_domain_or_region', 'B-G#protein_N/A', 'B-G#tissue',
            'B-G#RNA_molecule', 'B-G#multi_cell', 'B-G#DNA_family_or_group',
            'B-G#protein_subunit', 'I-G#protein_N/A', 'I-G#RNA_family_or_group',
            'B-G#body_part', 'B-G#peptide', 'I-G#other_name',
            'I-G#virus', 'I-G#protein_subunit', 'B-G#lipid',
            'B-G#protein_substructure', 'I-G#DNA_N/A', 'B-G#protein_complex',
            'I-G#protein_family_or_group', 'B-G#RNA_N/A', 'O',
            'B-G#carbohydrate', 'I-G#amino_acid_monomer', 'I-G#mono_cell',
        ]
        return genia_tags + ["[CLS]", "[SEP]"]

    def _create_examples(self, lines, set_type):
        """Wrap ``(tokens, labels)`` pairs into `InputExample`s.

        The guid is ``"<set_type>-<index>"`` and ``text_a`` is the tokens
        joined by single spaces; the label list is passed through as-is.
        """
        return [
            InputExample(guid="%s-%s" % (set_type, idx), text_a=' '.join(words), text_b=None, label=tags)
            for idx, (words, tags) in enumerate(lines)
        ]


class CwsmsraProcessor(DataProcessor):
    """Processor for the cws msra data set (B/I/E/S tagging).

    Reads ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` from the data directory.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Same as the train/dev loaders, for ``test.tsv``."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the four tags followed by "[CLS]" and "[SEP]"."""
        tags = ["B", "I", "E", "S"]
        return tags + ["[CLS]", "[SEP]"]

    def _create_examples(self, lines, set_type):
        """Wrap ``(tokens, labels)`` pairs into `InputExample`s.

        The guid is ``"<set_type>-<index>"`` and ``text_a`` is the tokens
        joined by single spaces; the label list is passed through as-is.
        """
        return [
            InputExample(guid="%s-%s" % (set_type, idx), text_a=' '.join(words), text_b=None, label=tags)
            for idx, (words, tags) in enumerate(lines)
        ]


class PeopledailyProcessor(DataProcessor):
    """Processor for NER data sets whose label set depends on the data set name.

    Reads ``train.txt`` / ``dev.txt`` / ``test.txt`` from the data directory.
    The label inventory is chosen by substring match on the lower-cased
    ``dataset`` name given to the constructor ("ontonote", "weibo" or
    "resume").
    """

    def __init__(self, dataset):
        """Remember the lower-cased data set name used by `get_labels`."""
        super(PeopledailyProcessor, self).__init__()
        self.dataset = dataset.lower()

    def get_train_examples(self, data_dir):
        """See base class."""
        train_path = os.path.join(data_dir, "train.txt")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        dev_path = os.path.join(data_dir, "dev.txt")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Same as the train/dev loaders, for ``test.txt``."""
        test_path = os.path.join(data_dir, "test.txt")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the label list matching ``self.dataset``.

        Raises:
            ValueError: if the data set name contains none of "ontonote",
                "weibo" or "resume".
        """
        name = self.dataset
        if "ontonote" in name:
            return ["O", "I-LOC", "B-ORG", "I-PER", "I-GPE", "B-LOC", "B-GPE", "B-PER", "I-ORG", "[CLS]", "[SEP]"]
        if "weibo" in name:
            return [
                "O", "B-PER.NOM", "I-PER.NOM", "B-LOC.NAM", "I-LOC.NAM", "B-PER.NAM", "I-PER.NAM",
                "B-GPE.NAM", "I-GPE.NAM", "B-ORG.NAM", "I-ORG.NAM", "B-ORG.NOM", "I-ORG.NOM",
                "B-LOC.NOM", "I-LOC.NOM", "B-GPE.NOM", "I-GPE.NOM", "[CLS]", "[SEP]",
            ]
        if "resume" in name:
            return [
                "O", "I-ORG", "I-RACE", "I-PRO", "I-NAME", "B-RACE", "B-ORG", "I-LOC", "I-TITLE", "I-EDU", "B-LOC",
                "B-TITLE", "B-CONT", "B-NAME", "I-CONT", "B-PRO", "B-EDU", "[CLS]", "[SEP]",
            ]
        raise ValueError("dataset can not be {}".format(self.dataset))

    def _create_examples(self, lines, set_type):
        """Wrap ``(tokens, labels)`` pairs into `InputExample`s.

        The guid is ``"<set_type>-<index>"`` and ``text_a`` is the tokens
        joined by single spaces; the label list is passed through as-is.
        """
        return [
            InputExample(guid="%s-%s" % (set_type, idx), text_a=' '.join(words), text_b=None, label=tags)
            for idx, (words, tags) in enumerate(lines)
        ]


class ConllProcessor(DataProcessor):
    """Processor for the CoNLL-2003 data set.

    Reads ``train.tsv`` / ``dev.tsv`` / ``test.tsv`` from the data directory.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        train_path = os.path.join(data_dir, "train.tsv")
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        dev_path = os.path.join(data_dir, "dev.tsv")
        return self._create_examples(self._read_tsv(dev_path), "dev")

    def get_test_examples(self, data_dir):
        """Same as the train/dev loaders, for ``test.tsv``."""
        test_path = os.path.join(data_dir, "test.tsv")
        return self._create_examples(self._read_tsv(test_path), "test")

    def get_labels(self):
        """Return the entity tags followed by "[CLS]" and "[SEP]"."""
        entity_tags = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
        return entity_tags + ["[CLS]", "[SEP]"]

    def _create_examples(self, lines, set_type):
        """Wrap ``(tokens, labels)`` pairs into `InputExample`s.

        The guid is ``"<set_type>-<index>"`` and ``text_a`` is the tokens
        joined by single spaces; the label list is passed through as-is.
        """
        return [
            InputExample(guid="%s-%s" % (set_type, idx), text_a=' '.join(words), text_b=None, label=tags)
            for idx, (words, tags) in enumerate(lines)
        ]


def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, ngram_dict):
    """Convert examples into padded `InputFeatures`, including n-gram inputs.

    Args:
        examples: Iterable of examples. ``text_a`` is split on single spaces
            into words and ``label`` is indexed once per word.
        label_list: Labels in id order. Ids are assigned starting at 1 (0 is
            used as the padding id); the list must contain "[CLS]" and
            "[SEP]".
        max_seq_length: Length every token-level list is padded/truncated to.
        tokenizer: Object providing ``tokenize(word)`` and
            ``convert_tokens_to_ids(tokens)``.
        ngram_dict: Object providing ``ngram_to_id_dict`` (looked up with
            tuples of tokens) and ``max_ngram_in_seq``.

    Returns:
        A list with one `InputFeatures` per example.

    Raises:
        KeyError: if an example carries a label that is not in ``label_list``.
    """
    import numpy as np

    # Ids start at 1 so that 0 stays free for padding positions.
    label_map = {label: i for i, label in enumerate(label_list, 1)}

    features = []
    for example in examples:
        words = example.text_a.split(' ')
        word_labels = example.label

        tokens = []
        labels = []
        valid = []
        for i, word in enumerate(words):
            pieces = tokenizer.tokenize(word)
            tokens.extend(pieces)
            word_label = word_labels[i]
            if pieces:
                # Only the first piece of a word carries the word's label and
                # is flagged as valid; the remaining pieces are flagged 0.
                labels.append(word_label)
                valid.append(1)
                valid.extend([0] * (len(pieces) - 1))

        # Leave room for the "[CLS]" and "[SEP]" markers.
        if len(tokens) >= max_seq_length - 1:
            keep = max_seq_length - 2
            tokens = tokens[:keep]
            labels = labels[:keep]
            valid = valid[:keep]

        ntokens = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(ntokens)
        valid = [1] + valid + [1]

        label_ids = [label_map["[CLS]"]]
        label_ids.extend(label_map[tag] for tag in labels[:len(tokens)])
        label_ids.append(label_map["[SEP]"])

        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        input_mask = [1] * len(input_ids)
        label_mask = [1] * len(label_ids)

        # Pad the token-level lists; padded positions are flagged valid (1)
        # but masked out (0) in input_mask.
        pad_len = max_seq_length - len(input_ids)
        input_ids.extend([0] * pad_len)
        input_mask.extend([0] * pad_len)
        segment_ids.extend([0] * pad_len)
        valid.extend([1] * pad_len)

        # There is one label per word but one input id per word piece, so the
        # label lists are usually shorter and are topped up separately.
        label_pad = max_seq_length - len(label_ids)
        label_ids.extend([0] * label_pad)
        label_mask.extend([0] * label_pad)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        assert len(valid) == max_seq_length
        assert len(label_mask) == max_seq_length

        # ----------- code for ngram BEGIN-----------
        ngram_matches = []
        # Check every window of 2 to 7 consecutive tokens against the n-gram
        # dictionary: p is the window length, q its start offset in `tokens`.
        # NOTE(review): q is relative to `tokens`, which does not include the
        # leading "[CLS]", while the position matrix below has one row per
        # padded sequence position -- confirm this alignment is intended.
        for p in range(2, 8):
            for q in range(0, len(tokens) - p + 1):
                character_segment = tuple(tokens[q:q + p])
                if character_segment in ngram_dict.ngram_to_id_dict:
                    ngram_index = ngram_dict.ngram_to_id_dict[character_segment]
                    ngram_matches.append([ngram_index, q, p, character_segment])

        # Shuffle so that the truncation below keeps a random subset.
        shuffle(ngram_matches)

        # Cap the number of n-grams in proportion to how full the sequence is.
        max_ngram_in_seq_proportion = math.ceil((len(tokens) / max_seq_length) * ngram_dict.max_ngram_in_seq)
        if len(ngram_matches) > max_ngram_in_seq_proportion:
            ngram_matches = ngram_matches[:max_ngram_in_seq_proportion]

        ngram_ids = [ngram[0] for ngram in ngram_matches]
        ngram_positions = [ngram[1] for ngram in ngram_matches]
        ngram_lengths = [ngram[2] for ngram in ngram_matches]
        ngram_tuples = [ngram[3] for ngram in ngram_matches]
        ngram_seg_ids = [0 if position < (len(tokens) + 2) else 1 for position in ngram_positions]

        # Builtin `bool` is used as the dtype: the `np.bool` alias was removed
        # from NumPy and raises AttributeError on current releases.
        ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=bool)
        ngram_mask_array[:len(ngram_ids)] = 1

        # Row = sequence position, column = n-gram slot; a 1 marks that the
        # n-gram in that slot covers the position.
        ngram_positions_matrix = np.zeros(shape=(max_seq_length, ngram_dict.max_ngram_in_seq), dtype=np.int32)
        for col, (start, length) in enumerate(zip(ngram_positions, ngram_lengths)):
            ngram_positions_matrix[start:start + length, col] = 1

        # Zero-pad up to the max ngram in seq length.
        padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids))
        ngram_ids += padding
        ngram_lengths += padding
        ngram_seg_ids += padding

        # ----------- code for ngram END-----------

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_ids,
                          ngram_ids=ngram_ids,
                          ngram_positions=ngram_positions_matrix,
                          ngram_lengths=ngram_lengths,
                          ngram_tuples=ngram_tuples,
                          ngram_seg_ids=ngram_seg_ids,
                          ngram_masks=ngram_mask_array,
                          valid_ids=valid,
                          label_mask=label_mask))
    return features

# Registry mapping a task name to the processor class that loads its data.
# Note that PeopledailyProcessor takes a data set name in its constructor,
# whereas the other processors are constructed without arguments.
processors = {
    "conll": ConllProcessor,
    "peopledaily": PeopledailyProcessor,
    "msra": PeopledailyProcessor,
    "cwsmsra": CwsmsraProcessor,
    "cwspku": CwsmsraProcessor,
    "genia": GeniaProcessor,
    "pos": PosProcessor,
}