forked from johsw/tagger
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tagger.php
294 lines (256 loc) · 8.4 KB
/
Tagger.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
<?php
/**
* @file
* Contains Tagger.
*
* The base of the Tagger library.
*/
/**
* The full path of the root directory of Tagger.
*
* Built from the directory containing this file, with a trailing slash, so
* it can be concatenated directly with relative paths (as the require_once
* statements below do).
*/
define('__ROOT__', dirname(__FILE__) . '/');
/**
* The Tagger version number.
*
* Returned by Tagger::getTaggerVersion().
*/
define('TAGGER_VERSION', 4);
// Make the mb_* string functions default to UTF-8.
mb_internal_encoding('UTF-8');
// Load the classes Tagger depends on, relative to the Tagger root directory.
// NOTE(review): Tagger::getVocabularyIds() also uses TaggerQueryManager, which
// is not loaded here - presumably one of these files pulls it in; confirm.
require_once __ROOT__ . 'classes/TaggerHelpers.class.php';
require_once __ROOT__ . 'classes/TagProcessor.class.php';
require_once __ROOT__ . 'logger/TaggerLogManager.class.php';
/**
 * The Tagger root class.
 *
 * A singleton that is obtained through Tagger::getTagger(). The configuration
 * and the word lists are kept in static properties, so they are shared by all
 * code in the current PHP process.
 */
class Tagger {

  /**
   * The singleton instance.
   */
  private static $instance;

  /**
   * The Tagger configuration.
   */
  private static $configuration;

  /**
   * Variable for holding the list of initwords.
   *
   * Keys are the words (one per line of the resource file); see __construct().
   */
  public static $initwords;

  /**
   * Variable for holding the list of prefix/infix words.
   *
   * Keys are the words (one per line of the resource file); see __construct().
   */
  public static $prefix_infix;

  /**
   * Variable for holding the list of stopwords.
   *
   * Keys are the words (one per line of the resource file); see __construct().
   */
  public static $stopwords;

  /**
   * When overriding the configuration the settings in this array will be fully
   * overridden (not appended to). Defaults to array('vocab_ids').
   */
  private static $override = array('vocab_ids');

  /**
   * Constructs the Tagger object.
   *
   * Loads 'defaults.php', merges $configuration on top of it and, when no
   * configuration was passed, additionally loads the configuration file.
   * Afterwards the word lists for the configured language are loaded.
   *
   * @param array $configuration
   *   Configuration for the current Tagger session.
   * @param string $file
   *   Configuration file to be loaded, relative to the Tagger root directory.
   *   Defaults to 'conf.php'. Only used when $configuration is empty.
   *
   * @throws Exception
   *   If $configuration is empty and the configuration file does not exist.
   */
  private function __construct($configuration = array(), $file = 'conf.php') {
    set_include_path(get_include_path() . PATH_SEPARATOR . dirname(__FILE__));
    if (!defined('TAGGER_DIR')) {
      define('TAGGER_DIR', dirname(__FILE__));
    }

    // defaults.php is expected to define $tagger_conf in this scope.
    include 'defaults.php';
    $tagger_conf = array_merge($tagger_conf, $configuration);

    if (empty($configuration)) {
      $conf_path = __ROOT__ . $file;
      if (!file_exists($conf_path)) {
        throw new Exception("Configuration file '$file' not found.", 1);
      }
      // Include exactly the file whose existence was checked. A bare
      // "include $file" is resolved through include_path and the working
      // directory and could pick up an unrelated file of the same name.
      include $conf_path;
    }
    self::$configuration = $tagger_conf;

    foreach (array('initwords', 'prefix_infix', 'stopwords') as $wordlist) {
      if (self::$$wordlist == NULL) {
        $path = realpath(__ROOT__ . 'resources/' . $wordlist . '/' . $wordlist . '_' .
          self::$configuration['language'] . '.txt');
        // realpath() returns FALSE for a missing file and file() returns FALSE
        // on read failure; neither may be handed to array_flip().
        $words = ($path === FALSE) ? FALSE : file($path, FILE_IGNORE_NEW_LINES);
        if ($words === FALSE) {
          trigger_error(
            "Tagger word list '$wordlist' for language '" .
            self::$configuration['language'] . "' could not be read.",
            E_USER_WARNING
          );
          $words = array();
        }
        // Flip so that the words become keys.
        self::$$wordlist = array_flip($words);
      }
    }
  }

  /**
   * Returns Tagger version number.
   */
  public static function getTaggerVersion() {
    return TAGGER_VERSION;
  }

  /**
   * Returns vocabulary ids.
   *
   * @return array
   *   The distinct 'vid' values found in the tagger_lookup table, keyed by
   *   themselves.
   */
  public function getVocabularyIds() {
    $sql = "SELECT vid FROM tagger_lookup GROUP BY vid";
    $result = TaggerQueryManager::query($sql);
    $ids = array();
    while ($row = TaggerQueryManager::fetch($result)) {
      $ids[$row['vid']] = $row['vid'];
    }
    return $ids;
  }

  /**
   * Returns singleton Tagger instance.
   *
   * The arguments are only used by the call that creates the instance; once
   * an instance exists it is returned as is and the arguments are ignored.
   *
   * @param array $configuration
   *   Configuration for the current Tagger session.
   * @param string $file
   *   Configuration file to be loaded. Defaults to 'conf.php'.
   *
   * @return Tagger
   *   The shared instance.
   */
  public static function getTagger($configuration = array(), $file = 'conf.php') {
    if (!isset(self::$instance)) {
      self::$instance = new self($configuration, $file);
    }
    return self::$instance;
  }

  /**
   * Returns either full configuration or a single setting.
   *
   * If called with no arguments this function returns the full configuration
   * array.
   * If called with arguments, each argument is a key in the configuration array
   * i.e. @code getConfiguration('keyword', 'vocab_ids') == $configuration['keyword']['vocab_ids']; @endcode
   *
   * @return mixed
   *   The full configuration array or the value of the requested setting.
   *
   * @throws ErrorException
   *   If the requested setting does not exist.
   */
  public static function getConfiguration() {
    // Make sure the configuration has been loaded.
    self::getTagger();

    $keys = func_get_args();
    if (count($keys) === 0) {
      return self::$configuration;
    }

    $opt = self::$configuration;
    $setting_str = '$configuration';
    foreach ($keys as $arg) {
      $setting_str .= "['$arg']";
      // Only descend into arrays: isset() on a string with a numeric key
      // would otherwise succeed and yield a single character.
      if (is_array($opt) && isset($opt[$arg])) {
        $opt = $opt[$arg];
      }
      else {
        throw new ErrorException('Setting ' . $setting_str . ' not found in configuration.');
      }
    }
    return $opt;
  }

  /**
   * Sets either full configuration or a single setting.
   *
   * If called with array arguments this merges the current configuration
   * with the arguments.
   * Otherwise the first argument is the new value of the setting (of any
   * type) and each following argument is a string key in the configuration
   * array.
   * i.e. @code setConfiguration(array(17), 'keyword', 'vocab_ids'); @endcode
   * is equivalent to
   * @code $configuration['keyword']['vocab_ids'] = array(17); @endcode
   *
   * @return mixed
   *   The merged configuration, the newly set value, or NULL when the
   *   arguments match neither calling convention.
   *
   * @throws ErrorException
   *   If a value is given without any key, or if the addressed setting does
   *   not exist.
   */
  public static function setConfiguration() {
    $args = func_get_args();
    $arg_count = count($args);

    // If all arguments are arrays - merge them into the configuration.
    if (!in_array(FALSE, array_map('is_array', $args), TRUE)) {
      self::$configuration = call_user_func_array(
        array('TaggerHelpers', 'arrayMergeRecursiveOverride'),
        array_merge(array(self::$override, self::$configuration), $args)
      );
      return self::$configuration;
    }

    // Otherwise: value followed by string keys. Only the keys have to be
    // strings; the value itself may be of any type (see the example above).
    $keys = array_slice($args, 1);
    if (!in_array(FALSE, array_map('is_string', $keys), TRUE)) {
      if ($arg_count < 2) {
        throw new ErrorException('Need at least two arguments.');
      }
      // Walk down the configuration by reference so that the final
      // assignment writes into self::$configuration.
      $opt =& self::$configuration;
      $setting_str = '$configuration';
      foreach ($keys as $arg) {
        $setting_str .= "['$arg']";
        if (is_array($opt) && isset($opt[$arg])) {
          $opt =& $opt[$arg];
        }
        else {
          throw new ErrorException('Setting ' . $setting_str . ' not found in configuration.');
        }
      }
      $opt = $args[0];
      return $opt;
    }

    // Unsupported argument combination (e.g. non-string keys): no change.
    return NULL;
  }

  // Prevent users to clone the instance
  public function __clone() {
    trigger_error('Clone is not allowed.', E_USER_ERROR);
  }

  /**
   * This is the main function to call, when you use Tagger.
   *
   * The given options temporarily override the configuration while the text
   * is processed; the previous configuration is restored afterwards, also
   * when processing fails with an exception.
   *
   * @param $text
   *   The text you want to tag.
   * @param array $options
   *   An associative array of configuration overrides. Only the following
   *   top-level keys are honoured, all other keys are ignored:
   *   - 'named_entity'
   *   - 'keyword'
   *   - 'return_marked_text'
   *   - 'linked_data'
   *   The values are merged into the configuration entries of the same name.
   *   NOTE(review): the nested structure presumably follows defaults.php
   *   (e.g. $options['named_entity']['vocab_ids']) - confirm there.
   *
   * @return
   *   TagProcessor object
   *
   * @throws ErrorException
   *   If neither named entity nor keyword vocabularies are configured.
   */
  public function tagText($text, $options = array()) {
    $default = self::$configuration;

    // These configuration options can be overridden when this function is
    // called.
    $overridable = array(
      'named_entity',
      'keyword',
      'return_marked_text',
      'linked_data',
    );
    $filtered_conf = array();
    foreach ($overridable as $key) {
      if (isset($options[$key])) {
        $filtered_conf[$key] = $options[$key];
      }
    }

    try {
      // Let some $options override $configuration temporarily.
      self::setConfiguration(self::$configuration, $filtered_conf);

      $ner = Tagger::getConfiguration('named_entity', 'vocab_ids');
      $keyword = Tagger::getConfiguration('keyword', 'vocab_ids');
      if (empty($ner) && empty($keyword)) {
        throw new ErrorException('Missing vocab definition in configuration.');
      }

      $tag_processor = new TagProcessor($text);
      $tag_processor->process();
    }
    catch (Exception $e) {
      // Do not leak the temporary overrides into later calls.
      self::$configuration = $default;
      throw $e;
    }

    self::$configuration = $default;
    return $tag_processor;
  }

  /**
   * Log to the internal Tagger log
   *
   * @param string $message
   *   The text to be logged
   * @param string $level
   *   The logging level of the message ('Verbose', 'Warning', 'Standard')
   *   Defaults to 'Standard'. A level that is not found in
   *   TaggerLogManager::$LOG_TYPE falls back to TaggerLogManager::STANDARD.
   */
  public function log($message, $level = 'Standard') {
    $level = array_search($level, TaggerLogManager::$LOG_TYPE);
    if ($level === FALSE) {
      $level = TaggerLogManager::STANDARD;
    }
    TaggerLogManager::logMsg($message, $level);
  }

}