fix lac log and optimize waybill (PaddlePaddle#265)
* fix lac log and optimize waybill

* fix typo

* note for drop cls

* fix waybill cuda error

* use ernie_crf_result

* add copyright
kinghuin authored Apr 16, 2021
1 parent ddf5d36 commit cabf3ab
Showing 8 changed files with 323 additions and 144 deletions.
20 changes: 16 additions & 4 deletions examples/information_extraction/waybill_ie/README.md
@@ -41,17 +41,29 @@ python download.py --data_dir ./
#### Start BiGRU + CRF training

```bash
export CUDA_VISIBLE_DEVICES=0 # Only single-GPU training is supported
export CUDA_VISIBLE_DEVICES=0
python run_bigru_crf.py
```

For a more detailed tutorial, see: [Waybill Information Extraction Based on Bi-GRU + CRF](https://aistudio.baidu.com/aistudio/projectdetail/1317771)

#### Start ERNIE + FC training

```bash
export CUDA_VISIBLE_DEVICES=0 # Only single-GPU training is supported
export CUDA_VISIBLE_DEVICES=0
python run_ernie.py
```

For a more detailed tutorial, see: [Optimizing Waybill Information Extraction with the PaddleNLP Pretrained Model ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/1329361)

#### Start ERNIE + CRF training

```bash
export CUDA_VISIBLE_DEVICES=0
python run_ernie_crf.py
```

For more detailed tutorials, see:

[Waybill Information Extraction Based on Bi-GRU + CRF](https://aistudio.baidu.com/aistudio/projectdetail/1317771)

[Optimizing Waybill Information Extraction with the PaddleNLP Pretrained Model ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/1329361)
100 changes: 100 additions & 0 deletions examples/information_extraction/waybill_ie/data.py
@@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddlenlp.datasets import MapDataset


def load_dict(dict_path):
    vocab = {}
    i = 0
    with open(dict_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            key = line.strip('\n')
            vocab[key] = i
            i += 1
    return vocab


def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)  # Skip header
            for line in fp.readlines():
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]


def convert_tokens_to_ids(tokens, vocab, oov_token=None):
    token_ids = []
    oov_id = vocab.get(oov_token) if oov_token else None
    for token in tokens:
        token_id = vocab.get(token, oov_id)
        token_ids.append(token_id)
    return token_ids


def convert_ernie_example(example, tokenizer, label_vocab):
    tokens, labels = example
    tokenized_input = tokenizer(
        tokens, return_length=True, is_split_into_words=True)
    # The special tokens '[CLS]' and '[SEP]' are assigned the label 'O'
    labels = ['O'] + labels + ['O']
    tokenized_input['labels'] = [label_vocab[x] for x in labels]
    return tokenized_input['input_ids'], tokenized_input[
        'token_type_ids'], tokenized_input['seq_len'], tokenized_input['labels']


def parse_decodes(sentences, predictions, lengths, label_vocab):
    """Parse the padded model output into readable (word, tag) pairs.
    Args:
        sentences (list): the input sentences, each a list of tokens.
        predictions (list): the predicted tag ids, grouped by batch.
        lengths (list): the valid (unpadded) length of each sentence, grouped by batch.
        label_vocab (dict): the mapping from label string to label id.
    Returns:
        outputs (list): the formatted output, one string per sentence.
    """
    predictions = [x for batch in predictions for x in batch]
    lengths = [x for batch in lengths for x in batch]
    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))

    outputs = []
    for idx, end in enumerate(lengths):
        sent = sentences[idx][:end]
        tags = [id_label[x] for x in predictions[idx][:end]]
        sent_out = []
        tags_out = []
        words = ""
        for s, t in zip(sent, tags):
            if t.endswith('-B') or t == 'O':
                if len(words):
                    sent_out.append(words)
                tags_out.append(t.split('-')[0])
                words = s
            else:
                words += s
        if len(sent_out) < len(tags_out):
            sent_out.append(words)
        outputs.append(''.join(
            [str((s, t)) for s, t in zip(sent_out, tags_out)]))
    return outputs
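As a quick illustration of the new shared helper, here is a minimal sketch of what `parse_decodes` returns; the label names, sample tokens, and the single one-sentence batch are hypothetical, not taken from the waybill data:

```python
from data import parse_decodes

# Hypothetical labels and predictions: one batch containing one sentence of
# three tokens, all tagged as a person name ('P-B' followed by 'P-I').
label_vocab = {'O': 0, 'P-B': 1, 'P-I': 2}
sentences = [['张', '三', '丰']]
predictions = [[[1, 2, 2]]]   # predicted tag ids, grouped by batch
lengths = [[3]]               # valid lengths, grouped by batch

print(parse_decodes(sentences, predictions, lengths, label_vocab))
# Expected output: ["('张三丰', 'P')"]
```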
49 changes: 49 additions & 0 deletions examples/information_extraction/waybill_ie/model.py
@@ -0,0 +1,49 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.nn as nn
from paddlenlp.transformers import ErniePretrainedModel
from paddlenlp.layers.crf import LinearChainCrf, ViterbiDecoder, LinearChainCrfLoss


class ErnieCrfForTokenClassification(nn.Layer):
    def __init__(self, ernie, crf_lr=100):
        super().__init__()
        self.num_classes = ernie.num_classes
        self.ernie = ernie  # allow an externally configured ERNIE model to be passed in
        self.crf = LinearChainCrf(
            self.num_classes, crf_lr=crf_lr, with_start_stop_tag=False)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(
            self.crf.transitions, with_start_stop_tag=False)

    def forward(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None,
                lengths=None,
                labels=None):
        logits = self.ernie(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids)

        if labels is not None:
            loss = self.crf_loss(logits, lengths, labels)
            return loss
        else:
            _, prediction = self.viterbi_decoder(logits, lengths)
            return prediction
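For context, a minimal inference sketch of how this wrapper could be used; the pretrained weights name `ernie-1.0`, the label set, and the sample text are illustrative assumptions, while the actual training and evaluation flow lives in `run_ernie_crf.py`:

```python
import paddle
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer

from model import ErnieCrfForTokenClassification

# Assumed label vocabulary; the real one is loaded from the waybill tag file.
label_vocab = {'P-B': 0, 'P-I': 1, 'A1-B': 2, 'A1-I': 3, 'O': 4}

ernie = ErnieForTokenClassification.from_pretrained(
    'ernie-1.0', num_classes=len(label_vocab))
model = ErnieCrfForTokenClassification(ernie)

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
encoded = tokenizer(
    list('黑龙江省双鸭山市'), return_length=True, is_split_into_words=True)

input_ids = paddle.to_tensor([encoded['input_ids']])
token_type_ids = paddle.to_tensor([encoded['token_type_ids']])
lengths = paddle.to_tensor([encoded['seq_len']])

# Without labels, forward() runs Viterbi decoding and returns predicted tag ids;
# with labels (and lengths), it returns the CRF loss instead.
preds = model(input_ids, token_type_ids=token_type_ids, lengths=lengths)
```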
67 changes: 4 additions & 63 deletions examples/information_extraction/waybill_ie/run_bigru_crf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -21,67 +21,7 @@
from paddlenlp.metrics import ChunkEvaluator
from paddlenlp.embeddings import TokenEmbedding


def parse_decodes(ds, decodes, lens, label_vocab):
    decodes = [x for batch in decodes for x in batch]
    lens = [x for batch in lens for x in batch]
    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))

    outputs = []
    for idx, end in enumerate(lens):
        sent = ds.data[idx][0][:end]
        tags = [id_label[x] for x in decodes[idx][:end]]
        sent_out = []
        tags_out = []
        words = ""
        for s, t in zip(sent, tags):
            if t.endswith('-B') or t == 'O':
                if len(words):
                    sent_out.append(words)
                tags_out.append(t.split('-')[0])
                words = s
            else:
                words += s
        if len(sent_out) < len(tags_out):
            sent_out.append(words)
        outputs.append(''.join(
            [str((s, t)) for s, t in zip(sent_out, tags_out)]))
    return outputs


def convert_tokens_to_ids(tokens, vocab, oov_token=None):
    token_ids = []
    oov_id = vocab.get(oov_token) if oov_token else None
    for token in tokens:
        token_id = vocab.get(token, oov_id)
        token_ids.append(token_id)
    return token_ids


def load_dict(dict_path):
    vocab = {}
    i = 0
    for line in open(dict_path, 'r', encoding='utf-8'):
        key = line.strip('\n')
        vocab[key] = i
        i += 1
    return vocab


def load_dataset(datafiles):
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)
            for line in fp.readlines():
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
from data import load_dict, load_dataset, convert_tokens_to_ids, parse_decodes


class BiGRUWithCRF(nn.Layer):
@@ -178,7 +118,8 @@ def convert_example(example):

model.evaluate(eval_data=test_loader)
outputs, lens, decodes = model.predict(test_data=test_loader)
preds = parse_decodes(test_ds, decodes, lens, label_vocab)
sentences = [example[0] for example in test_ds.data]
preds = parse_decodes(sentences, decodes, lens, label_vocab)

file_path = "bigru_results.txt"
with open(file_path, "w", encoding="utf8") as fout:
