-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
79 lines (65 loc) · 3.2 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from os.path import dirname, join
from underthesea_flow.flow import Flow
from underthesea_flow.model import Model
from underthesea_flow.model.crf import CRF
from underthesea_flow.reader.tagged_corpus import TaggedCorpus
from underthesea_flow.transformer.tagged import TaggedTransformer
from underthesea_flow.validation.validation import TrainTestSplitValidation
from preprocess import vlsp_chunk
if __name__ == '__main__':
    # ---------------------------------------------------------------------
    # Experiment: every step below is registered on a single Flow object.
    # ---------------------------------------------------------------------
    flow = Flow()

    # ---------------------------------------------------------------------
    # Data
    # ---------------------------------------------------------------------
    # The train, dev and test files of the vlsp2016 corpus are loaded
    # together; this is the setup intended for producing a model to save.
    # NOTE: for evaluation-only runs, earlier revisions pointed at a single
    # "train.txt" under corpus/vlsp_chunk (or corpus/vlsp_chunk_sample).
    corpus_dir = join(dirname(__file__), "corpus", "vlsp2016")
    sentences = []
    for split_name in ("train.txt", "dev.txt", "test.txt"):
        sentences.extend(vlsp_chunk.load_data(join(corpus_dir, split_name)))
    flow.data(sentences=sentences)

    # ---------------------------------------------------------------------
    # Transformer
    # ---------------------------------------------------------------------
    # Feature templates handed to TaggedTransformer, grouped by kind and
    # concatenated in this order. Presumably T[i] addresses the token at
    # relative offset i and a trailing [k] selects a tag column -- confirm
    # against TaggedTransformer.
    lowercase_features = [
        "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower",
    ]
    title_case_features = [
        "T[0].istitle", "T[-1].istitle", "T[1].istitle",
    ]
    # word unigrams followed by word bigrams
    word_features = [
        "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
        "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
    ]
    # pos unigrams followed by pos bigrams (column [1])
    pos_features = [
        "T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]",
        "T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]",
    ]
    # chunk column ([2]) at offsets -3, -2 and -1
    chunk_features = [
        "T[-3][2]", "T[-2][2]", "T[-1][2]",
    ]
    template = (
        lowercase_features
        + title_case_features
        + word_features
        + pos_features
        + chunk_features
    )
    transformer = TaggedTransformer(template)
    flow.transform(transformer)

    # ---------------------------------------------------------------------
    # Models
    # ---------------------------------------------------------------------
    crf_params = {
        # coefficient for L1 penalty
        'c1': 1.0,
        # coefficient for L2 penalty
        'c2': 1e-3,
        'max_iterations': 3000,
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    }
    crf_estimator = CRF(params=crf_params)
    flow.add_model(Model(crf_estimator, "CRF"))

    # ---------------------------------------------------------------------
    # Evaluation
    # ---------------------------------------------------------------------
    for score_name in ('f1_chunk', 'accuracy_chunk'):
        flow.add_score(score_name)
    validation = TrainTestSplitValidation(test_size=0.1)
    flow.set_validation(validation)

    # ---------------------------------------------------------------------
    # Run Experiment
    # ---------------------------------------------------------------------
    flow.train()
    # NOTE: to persist the trained model, follow up with e.g.
    #   flow.save_model("CRF", filename="chunking_crf_20171006.model")