fix lac log and optimize waybill (PaddlePaddle#265)
* fix lac log and optimize waybill

* fix typo

* note for drop cls

* fix waybill cuda error

* use ernie_crf_result

* add copyright
kinghuin authored Apr 16, 2021
1 parent ddf5d36 commit cabf3ab
Showing 8 changed files with 323 additions and 144 deletions.
20 changes: 16 additions & 4 deletions examples/information_extraction/waybill_ie/README.md
@@ -41,17 +41,29 @@ python download.py --data_dir ./
#### Start BiGRU + CRF training

```bash
export CUDA_VISIBLE_DEVICES=0 # Only single-GPU training is supported
export CUDA_VISIBLE_DEVICES=0
python run_bigru_crf.py
```

For a more detailed tutorial, see: [Waybill Information Extraction Based on Bi-GRU + CRF](https://aistudio.baidu.com/aistudio/projectdetail/1317771)

#### Start ERNIE + FC training

```bash
export CUDA_VISIBLE_DEVICES=0 # Only single-GPU training is supported
export CUDA_VISIBLE_DEVICES=0
python run_ernie.py
```

For a more detailed tutorial, see: [Optimizing Waybill Information Extraction with the PaddleNLP Pretrained Model ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/1329361)

#### Start ERNIE + CRF training

```bash
export CUDA_VISIBLE_DEVICES=0
python run_ernie_crf.py
```

For more detailed tutorials, see:

[Waybill Information Extraction Based on Bi-GRU + CRF](https://aistudio.baidu.com/aistudio/projectdetail/1317771)

[Optimizing Waybill Information Extraction with the PaddleNLP Pretrained Model ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/1329361)
100 changes: 100 additions & 0 deletions examples/information_extraction/waybill_ie/data.py
@@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlenlp.datasets import MapDataset


def load_dict(dict_path):
    vocab = {}
    i = 0
    with open(dict_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            key = line.strip('\n')
            vocab[key] = i
            i += 1
    return vocab


def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)  # Skip header
            for line in fp.readlines():
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]


def convert_tokens_to_ids(tokens, vocab, oov_token=None):
    token_ids = []
    oov_id = vocab.get(oov_token) if oov_token else None
    for token in tokens:
        token_id = vocab.get(token, oov_id)
        token_ids.append(token_id)
    return token_ids


def convert_ernie_example(example, tokenizer, label_vocab):
    tokens, labels = example
    tokenized_input = tokenizer(
        tokens, return_length=True, is_split_into_words=True)
    # The special tokens '[CLS]' and '[SEP]' are assigned the label 'O'
    labels = ['O'] + labels + ['O']
    tokenized_input['labels'] = [label_vocab[x] for x in labels]
    return tokenized_input['input_ids'], tokenized_input[
        'token_type_ids'], tokenized_input['seq_len'], tokenized_input['labels']


def parse_decodes(sentences, predictions, lengths, label_vocab):
    """Parse the padded model output into readable (word, tag) pairs.
    Args:
        sentences (list): the input sentences, each a list of tokens.
        predictions (list): the predicted tag ids, grouped by batch.
        lengths (list): the valid (unpadded) length of each sentence, grouped by batch.
        label_vocab (dict): the mapping from label string to label id.
    Returns:
        outputs (list): the formatted output, one string per sentence.
    """
    predictions = [x for batch in predictions for x in batch]
    lengths = [x for batch in lengths for x in batch]
    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))

    outputs = []
    for idx, end in enumerate(lengths):
        sent = sentences[idx][:end]
        tags = [id_label[x] for x in predictions[idx][:end]]
        sent_out = []
        tags_out = []
        words = ""
        for s, t in zip(sent, tags):
            if t.endswith('-B') or t == 'O':
                if len(words):
                    sent_out.append(words)
                tags_out.append(t.split('-')[0])
                words = s
            else:
                words += s
        if len(sent_out) < len(tags_out):
            sent_out.append(words)
        outputs.append(''.join(
            [str((s, t)) for s, t in zip(sent_out, tags_out)]))
    return outputs
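As a quick illustration of the new shared helper, here is a minimal sketch of what `parse_decodes` returns; the label names, sample tokens, and the single one-sentence batch are hypothetical, not taken from the waybill data:

```python
from data import parse_decodes

# Hypothetical labels and predictions: one batch containing one sentence of
# three tokens, all tagged as a person name ('P-B' followed by 'P-I').
label_vocab = {'O': 0, 'P-B': 1, 'P-I': 2}
sentences = [['张', '三', '丰']]
predictions = [[[1, 2, 2]]]   # predicted tag ids, grouped by batch
lengths = [[3]]               # valid lengths, grouped by batch

print(parse_decodes(sentences, predictions, lengths, label_vocab))
# Expected output: ["('张三丰', 'P')"]
```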
49 changes: 49 additions & 0 deletions examples/information_extraction/waybill_ie/model.py
@@ -0,0 +1,49 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.nn as nn
from paddlenlp.transformers import ErniePretrainedModel
from paddlenlp.layers.crf import LinearChainCrf, ViterbiDecoder, LinearChainCrfLoss


class ErnieCrfForTokenClassification(nn.Layer):
    def __init__(self, ernie, crf_lr=100):
        super().__init__()
        self.num_classes = ernie.num_classes
        self.ernie = ernie  # allow an externally configured ERNIE model to be passed in
        self.crf = LinearChainCrf(
            self.num_classes, crf_lr=crf_lr, with_start_stop_tag=False)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(
            self.crf.transitions, with_start_stop_tag=False)

    def forward(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None,
                lengths=None,
                labels=None):
        logits = self.ernie(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids)

        if labels is not None:
            loss = self.crf_loss(logits, lengths, labels)
            return loss
        else:
            _, prediction = self.viterbi_decoder(logits, lengths)
            return prediction
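For context, a minimal inference sketch of how this wrapper could be used; the pretrained weights name `ernie-1.0`, the label set, and the sample text are illustrative assumptions, while the actual training and evaluation flow lives in `run_ernie_crf.py`:

```python
import paddle
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer

from model import ErnieCrfForTokenClassification

# Assumed label vocabulary; the real one is loaded from the waybill tag file.
label_vocab = {'P-B': 0, 'P-I': 1, 'A1-B': 2, 'A1-I': 3, 'O': 4}

ernie = ErnieForTokenClassification.from_pretrained(
    'ernie-1.0', num_classes=len(label_vocab))
model = ErnieCrfForTokenClassification(ernie)

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
encoded = tokenizer(
    list('黑龙江省双鸭山市'), return_length=True, is_split_into_words=True)

input_ids = paddle.to_tensor([encoded['input_ids']])
token_type_ids = paddle.to_tensor([encoded['token_type_ids']])
lengths = paddle.to_tensor([encoded['seq_len']])

# Without labels, forward() runs Viterbi decoding and returns predicted tag ids;
# with labels (and lengths), it returns the CRF loss instead.
preds = model(input_ids, token_type_ids=token_type_ids, lengths=lengths)
```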
67 changes: 4 additions & 63 deletions examples/information_extraction/waybill_ie/run_bigru_crf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -21,67 +21,7 @@
from paddlenlp.metrics import ChunkEvaluator
from paddlenlp.embeddings import TokenEmbedding


def parse_decodes(ds, decodes, lens, label_vocab):
    decodes = [x for batch in decodes for x in batch]
    lens = [x for batch in lens for x in batch]
    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))

    outputs = []
    for idx, end in enumerate(lens):
        sent = ds.data[idx][0][:end]
        tags = [id_label[x] for x in decodes[idx][:end]]
        sent_out = []
        tags_out = []
        words = ""
        for s, t in zip(sent, tags):
            if t.endswith('-B') or t == 'O':
                if len(words):
                    sent_out.append(words)
                tags_out.append(t.split('-')[0])
                words = s
            else:
                words += s
        if len(sent_out) < len(tags_out):
            sent_out.append(words)
        outputs.append(''.join(
            [str((s, t)) for s, t in zip(sent_out, tags_out)]))
    return outputs


def convert_tokens_to_ids(tokens, vocab, oov_token=None):
    token_ids = []
    oov_id = vocab.get(oov_token) if oov_token else None
    for token in tokens:
        token_id = vocab.get(token, oov_id)
        token_ids.append(token_id)
    return token_ids


def load_dict(dict_path):
    vocab = {}
    i = 0
    for line in open(dict_path, 'r', encoding='utf-8'):
        key = line.strip('\n')
        vocab[key] = i
        i += 1
    return vocab


def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)
            for line in fp.readlines():
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
from data import load_dict, load_dataset, convert_tokens_to_ids, parse_decodes


class BiGRUWithCRF(nn.Layer):
@@ -178,7 +118,8 @@ def convert_example(example):

model.evaluate(eval_data=test_loader)
outputs, lens, decodes = model.predict(test_data=test_loader)
preds = parse_decodes(test_ds, decodes, lens, label_vocab)
sentences = [example[0] for example in test_ds.data]
preds = parse_decodes(sentences, decodes, lens, label_vocab)

file_path = "bigru_results.txt"
with open(file_path, "w", encoding="utf8") as fout:
