-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpos.go
193 lines (167 loc) · 3.89 KB
/
pos.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
package pos
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"os"
"os/exec"
"runtime"
"strings"
)
// Descriptions - maps Penn Treebank part-of-speech tags to their
// human-readable descriptions. Result.Description looks tags up here.
// https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
var Descriptions = map[string]string{
"CC": "Coordinating conjunction",
"CD": "Cardinal number",
"DT": "Determiner",
"EX": "Existential there",
"FW": "Foreign word",
"IN": "Preposition or subordinating conjunction",
"JJ": "Adjective",
"JJR": "Adjective, comparative",
"JJS": "Adjective, superlative",
"LS": "List item marker",
"MD": "Modal",
"NN": "Noun, singular or mass",
"NNS": "Noun, plural",
"NNP": "Proper noun, singular",
"NNPS": "Proper noun, plural",
"PDT": "Predeterminer",
"POS": "Possessive ending",
"PRP": "Personal pronoun",
"PRP$": "Possessive pronoun",
"RB": "Adverb",
"RBR": "Adverb, comparative",
"RBS": "Adverb, superlative",
"RP": "Particle",
"SYM": "Symbol",
"TO": "to",
"UH": "Interjection",
"VB": "Verb, base form",
"VBD": "Verb, past tense",
"VBG": "Verb, gerund or present participle",
"VBN": "Verb, past participle",
"VBP": "Verb, non-3rd person singular present",
"VBZ": "Verb, 3rd person singular present",
"WDT": "Wh-determiner",
"WP": "Wh-pronoun",
"WP$": "Possessive wh-pronoun",
"WRB": "Wh-adverb",
}
// Tagger - holds the configuration used to launch the Stanford POS tagger
// as an external java process. Create one with NewTagger.
type Tagger struct {
// model is the path to the tagger model file; passed after "-model".
model string
// tagger is the path to the tagger jar; used to build the "-cp" classpath.
tagger string
// java is the java executable to run (NewTagger sets "java").
java string
// opts are extra arguments placed before "-cp" on the java command line
// (NewTagger sets ["-mx300m"]).
opts []string
// separator is appended to the jar path in the classpath:
// ";" on windows, ":" everywhere else.
separator string
// encoding is passed after "-encoding" (NewTagger sets "utf8").
encoding string
}
// Result - a single tagged token produced by Tagger.Tag.
type Result struct {
// Word is the token text taken from the tagger output.
Word string
// TAG is the tag attached to Word; it is the key used by Description
// to look up Descriptions.
TAG string
}
// Description - returns the human-readable description of r.TAG taken from
// Descriptions, or an empty string when the tag is not listed there.
func (r *Result) Description() string {
	// Indexing a map with a missing key yields the zero value "", which is
	// exactly the fallback this method reports for unknown tags.
	return Descriptions[r.TAG]
}
// NewTagger - builds a Tagger for model file m and tagger jar t.
//
// Defaults: java executable "java", encoding "utf8", java options
// ["-mx300m"], and a classpath separator of ";" on windows or ":" elsewhere.
// The model is validated first, then the jar; the first failure is returned
// together with a nil Tagger.
func NewTagger(m, t string) (*Tagger, error) {
	tg := &Tagger{
		java:      "java",
		encoding:  "utf8",
		opts:      []string{"-mx300m"},
		separator: ":",
	}
	if runtime.GOOS == "windows" {
		tg.separator = ";"
	}

	if err := tg.SetModel(m); err != nil {
		return nil, err
	}
	if err := tg.SetTagger(t); err != nil {
		return nil, err
	}
	return tg, nil
}
// SetModel - sets the path of the stanford pos tagger model.
//
// The path is checked with os.Stat; if that call fails for any reason the
// stored model is left unchanged and the error "Model not exists!" is
// returned.
func (p *Tagger) SetModel(m string) error {
	_, statErr := os.Stat(m)
	if statErr != nil {
		return errors.New("Model not exists!")
	}

	p.model = m
	return nil
}
// SetTagger - sets the path of the stanford pos tagger jar file.
//
// The path is checked with os.Stat; if that call fails for any reason the
// stored jar path is left unchanged and the error "Tagger not exists!" is
// returned.
func (p *Tagger) SetTagger(t string) error {
	_, statErr := os.Stat(t)
	if statErr != nil {
		return errors.New("Tagger not exists!")
	}

	p.tagger = t
	return nil
}
// SetJavaPath - sets the java executable used to launch the tagger
// (default: "java"). The value is stored as given; it is not checked here.
func (p *Tagger) SetJavaPath(j string) {
p.java = j
}
// SetJavaOpts - sets the options placed before "-cp" on the java command
// line (default: ["-mx300m"]).
//
// The slice is copied, so later changes the caller makes to opts do not
// alter the tagger's configuration.
func (p *Tagger) SetJavaOpts(opts []string) {
	p.opts = append([]string(nil), opts...)
}
// SetEncoding - sets the value passed to the tagger after "-encoding"
// (default: utf8).
func (p *Tagger) SetEncoding(e string) {
p.encoding = e
}
// parse - converts raw tagger output into a slice of Results.
//
// out is split on any run of whitespace (spaces and newlines alike), and each
// token is expected to look like word_TAG. The tag is whatever follows the
// last underscore, so a word that itself contains underscores is kept whole.
// A token with no underscore is returned with an empty TAG rather than
// causing a panic. Empty or whitespace-only output yields an empty slice.
func (p *Tagger) parse(out string) []*Result {
	tokens := strings.Fields(out)
	res := make([]*Result, 0, len(tokens))
	for _, tok := range tokens {
		sep := strings.LastIndex(tok, "_")
		if sep < 0 {
			// No tag separator: keep the token so nothing is silently lost.
			res = append(res, &Result{Word: tok})
			continue
		}
		res = append(res, &Result{
			Word: tok[:sep],
			TAG:  tok[sep+1:],
		})
	}
	return res
}
// Tag - uses the stanford pos tagger to tag the input sentence.
//
// input is written to a temporary file, which is handed to
// edu.stanford.nlp.tagger.maxent.MaxentTagger through "-textFile". The
// command's standard output is parsed into Results. The temporary file is
// removed before Tag returns.
//
// An error is returned when the temporary file cannot be created, written or
// closed, or when the java command fails; in the last case the error text
// also carries whatever the command wrote to standard error.
func (p *Tagger) Tag(input string) ([]*Result, error) {
	tmp, err := ioutil.TempFile("", "nlptemp")
	if err != nil {
		return nil, err
	}
	defer os.Remove(tmp.Name())

	if _, err = tmp.WriteString(input); err != nil {
		// The write error is the one worth reporting; Close is best-effort here.
		tmp.Close()
		return nil, err
	}
	// Close before starting java: this releases the descriptor (previously
	// leaked on every call) and lets the deferred Remove succeed on systems
	// that refuse to delete a file that is still open.
	if err = tmp.Close(); err != nil {
		return nil, err
	}

	// Build the argument list in a fresh slice so appending never writes into
	// the backing array of p.opts.
	args := make([]string, 0, len(p.opts)+9)
	args = append(args, p.opts...)
	args = append(args,
		"-cp",
		p.tagger+p.separator,
		"edu.stanford.nlp.tagger.maxent.MaxentTagger",
		"-model",
		p.model,
		"-textFile",
		tmp.Name(),
		"-encoding",
		p.encoding,
	)

	cmd := exec.Command(p.java, args...)
	var out, stderr bytes.Buffer
	cmd.Stdout = &out
	cmd.Stderr = &stderr
	if err = cmd.Run(); err != nil {
		return nil, fmt.Errorf("%s: %s", err, stderr.String())
	}

	return p.parse(out.String()), nil
}