-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.php
122 lines (96 loc) · 3.26 KB
/
index.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
<?php
/*
* Russian text analyzer
* @author Vlad B. @ hi@vladbarinov.ru
*/
// --- Bootstrap: paths, dependencies, runtime configuration, initial state ---

// Absolute path of the directory containing this script, with a trailing separator.
define('DICROOT', realpath(dirname(__FILE__)) . DIRECTORY_SEPARATOR);

require_once 'phpmorphy/src/common.php';
require_once 'funcs.php';
require_once 'Benchmark.php';

// NOTE(review): errors are printed to the response; confirm this is wanted outside development.
ini_set('display_errors', 'On');

// post_max_size is deliberately NOT set here: it is a PHP_INI_PERDIR directive,
// so ini_set() cannot change it at runtime. Configure it in php.ini / .htaccess / .user.ini.

// -1 reports every error level (same effect as E_ALL | E_STRICT) without
// referencing the E_STRICT constant, which is deprecated in recent PHP versions.
error_reporting(-1);

// The script requires the ru_RU.CP1251 locale and aborts when it is unavailable.
if (!setlocale(LC_ALL, 'ru_RU.CP1251'))
{
    die('Â ñèñòåìå íåò ëîêàëè ru_RU.CP1251');
}

// Effective post_max_size with the 'M', 'K' and 'b' letters stripped.
// Not used further in this file — presumably consumed by view.php (confirm).
$fmsize = str_replace(array('M', 'K', 'b'), '', ini_get('post_max_size'));

// Timer covering the whole request; stopped just before statistics are collected.
$app = Benchmark::start('Ïðèëîæåíèå');

// Initial state shared with the rest of the script and the included template.
$stats = array();
$text = '';
$errors = array();
$words = array();
$unique = array();
$files = array();

// Stop words: a comma-separated list stored next to this script.
// Entries are trimmed so that a trailing newline or a space after a comma does
// not make a stop word unmatchable; empty entries are dropped. A missing or
// unreadable file yields an empty list instead of a suppressed warning.
$stop_words = array();
if (is_readable(DICROOT . 'stop_words_ru.txt'))
{
    $stop_words = explode(',', (string) file_get_contents(DICROOT . 'stop_words_ru.txt'));
    $stop_words = array_values(array_filter(array_map('trim', $stop_words), 'strlen'));
}
// --- Input acquisition -------------------------------------------------------
// The text to analyse comes from an uploaded file or, failing that, from the
// form's "text" field. Either way it is normalized and split into $words.

// Token boundary pattern shared by both input paths.
// NOTE(review): no /u modifier — matching is byte-oriented, which fits the
// single-byte CP1251 locale this script requires; confirm before changing encodings.
$split_pattern = '/(?<!-)\b[\s]*(?!-)/';

if (isset($_FILES['file']['error'], $_FILES['file']['tmp_name'])
    && $_FILES['file']['error'] === UPLOAD_ERR_OK
    && is_uploaded_file($_FILES['file']['tmp_name']))
{
    // The user uploaded a file
    $st = Benchmark::start('Íîðìàëèçàöèÿ è ðàçáèåíèå òåêñòà');
    $text = file_get_contents($_FILES['file']['tmp_name']);
    if ($text === FALSE)
    {
        // Unreadable temp file: treat as empty input rather than propagating FALSE.
        $text = '';
    }
    $norm_text = normalize(trim($text));
    // -1 means "no limit"; passing NULL for the int limit is deprecated in newer PHP.
    $words = preg_split($split_pattern, $norm_text, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === FALSE)
    {
        // preg_split() failed; keep $words countable for the code below.
        $words = array();
    }
    Benchmark::stop($st);
}
else
{
    // Report a failed upload. On any upload error PHP leaves tmp_name empty, so
    // is_uploaded_file() is false and an error check nested under it can never
    // run; the error code has to be inspected on its own. UPLOAD_ERR_NO_FILE
    // only means the file field was left blank, which is not an error here.
    if (isset($_FILES['file']['error'])
        && is_int($_FILES['file']['error'])
        && $_FILES['file']['error'] !== UPLOAD_ERR_OK
        && $_FILES['file']['error'] !== UPLOAD_ERR_NO_FILE)
    {
        echo upload_error($_FILES['file']['error']);
    }

    // Text entered through the form (only plain string values are accepted)
    if (!empty($_POST['text']) && is_string($_POST['text']))
    {
        $st = Benchmark::start('Íîðìàëèçàöèÿ è ðàçáèåíèå òåêñòà');
        // Form input is capped at 200000 bytes after trimming.
        $text = substr(trim($_POST['text']), 0, 200000);
        $norm_text = normalize($text);
        $words = preg_split($split_pattern, $norm_text, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === FALSE)
        {
            $words = array();
        }
        Benchmark::stop($st);
    }
}
// Build the dictionaries only when the input produced at least one token.
if (count($words) > 0) {
    // Dictionary over all tokens. unique() returns a list whose first three
    // elements are stored as $dict, $dict_dynamics and $zipf.
    $freq = Benchmark::start('Ñîñòàâëåíèå ñëîâàðÿ');
    list($dict, $dict_dynamics, $zipf) = unique($words);
    Benchmark::stop($freq);

    // Same dictionary, built from the tokens left after stop-word removal.
    $stop = Benchmark::start('Âûäåëåíèå ñòîï-ñëîâ');
    $sane_words = remove_stop($words, $stop_words);
    list($dict_stop) = unique($sane_words);
    Benchmark::stop($stop);
}
// Create the dictionary files that were requested in the form.
// Each entry of $files maps a label to whatever create_dic() returns.

// Plain frequency dictionary
if (!empty($dict) && isset($_POST['freq'])) {
    $files['×àñòîòíûé ñëîâàðü'] = create_dic($dict, 'freq');
}

// Frequency dictionary after stop-word removal
if (!empty($dict_stop) && isset($_POST['stop'])) {
    $files['×àñòîòíûé ñëîâàðü (áåç ñòîï-ñëîâ)'] = create_dic($dict_stop, 'stop');
}
// Optional morphology-aware dictionary, built from the stop-word-free tokens.
// The form field "mtype" selects the backend: 1 = mystem, 2 = phpmorphy.
if (isset($_POST['morph']) && !empty($sane_words)) {
    $morph = Benchmark::start('Ìîðôîëãèÿ');

    if (isset($_POST['mtype'])) {
        // switch compares loosely, so string form values such as "1" match.
        switch ($_POST['mtype']) {
            case 1:
                // mystem: the result is registered only when it is truthy
                $dict_morph = mystem($sane_words);
                if ($dict_morph) {
                    $files['×àñòîòíûé ñëîâàðü (áåç ñòîï-ñëîâ, c ìîðôîëîãèåé)'] = $dict_morph;
                }
                break;

            case 2:
                // phpmorphy: unique() is called with TRUE as its second argument
                list($dict_morph) = unique($sane_words, TRUE);
                $files['×àñòîòíûé ñëîâàðü (áåç ñòîï-ñëîâ, c ìîðôîëîãèåé)'] = create_dic($dict_morph, 'morph');
                break;
        }
    }

    Benchmark::stop($morph);
}
// Figures for the statistics section; computed only when there is input text.
if ($text) {
    // Number of space characters in the raw text.
    $spaces = substr_count($text, ' ');
    // NOTE(review): strlen() counts bytes; that equals the character count only
    // for single-byte text (the script requires the ru_RU.CP1251 locale) — confirm.
    $number_of_symbols = strlen($text);

    // Stop words that actually occur among the tokens, then passed through unique().
    $stop_words_actual = array_intersect($words, $stop_words);
    list($stop_words_array) = unique($stop_words_actual);
}

// Close the application-wide timer and collect all benchmark results.
Benchmark::stop($app);
$stats = Benchmark::statistics();

// Main template
include 'view.php';