-
Notifications
You must be signed in to change notification settings - Fork 11
/
wordwisecreator.php
160 lines (119 loc) · 4.85 KB
/
wordwisecreator.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
<?php
// ---------------------------------------------------------------------------
// Wordwise creator - command line entry point.
//
// Usage: php wordwisecreator.php input_file [hint_level]
//
// Pipeline:
//   1. load stopwords.txt and wordwise-dict.csv from the working directory
//   2. dump the book to HTML with Calibre's `ebook-convert`
//   3. wrap every known word in <ruby>word<rt>short definition</rt></ruby>
//   4. convert the annotated HTML to EPUB, AZW3 and PDF next to the input file
//
// Known limitation: the HTML is tokenised on single spaces only, so a word
// that is glued to markup or separated from its neighbour by a newline is
// not recognised.
// ---------------------------------------------------------------------------

if (!isset($argv[1])) {
    echo 'usage: '.$argv[0]." input_file hint_level\n";
    echo "input_file : path to file need to generate wordwise \n";
    echo "hint_level : from 1 to 5 default is 5, 1 is less wordwise hint show - only hard word will have definition, 5 is all wordwise hints show\n";
    die();
}

$bookfile = $argv[1];
if (!is_file($bookfile)) {
    die("[!] Input file not found: $bookfile \n");
}
$bookpath = pathinfo($bookfile, PATHINFO_DIRNAME);
$bookfilename = pathinfo($bookfile, PATHINFO_FILENAME);

// hint_level is compared numerically below, so normalise it to an int in 1..5.
$hint_level = 5;
if (isset($argv[2])) {
    if (is_numeric($argv[2])) {
        $hint_level = max(1, min(5, (int) $argv[2]));
    } else {
        echo "[!] Invalid hint_level '".$argv[2]."', using default 5 \n";
    }
}
echo "[+] Hint level: $hint_level \n";

// Working files created in the current directory. No ".\" prefix: a backslash
// is only a path separator under cmd.exe and is an escape character in POSIX
// shells, which made the dump land in a different place than PHP looked for.
$dump_archive = 'book_dump.htmlz';
$dump_dir = 'book_dump_html';
$dump_index = $dump_dir . DIRECTORY_SEPARATOR . 'index1.html';

// Quote one shell argument without altering its bytes. escapeshellarg() is
// avoided on purpose: it can drop non-ASCII bytes when the process locale is
// not multibyte-aware, which would corrupt accented book file names.
$quote_arg = function ($arg) {
    if (DIRECTORY_SEPARATOR === '\\') {
        // cmd.exe: double quotes; '"' cannot occur in a Windows file name.
        return '"' . str_replace('"', '', $arg) . '"';
    }
    // POSIX sh: single quotes, with embedded single quotes spliced in.
    return "'" . str_replace("'", "'\\''", $arg) . "'";
};
$convert = function ($input, $output) use ($quote_arg) {
    return shell_exec('ebook-convert ' . $quote_arg($input) . ' ' . $quote_arg($output));
};

//Load Stop Words
echo "[+] Load Stop Words \n";
$stopword_lines = file('stopwords.txt', FILE_IGNORE_NEW_LINES);
if ($stopword_lines === false) {
    die("[!] Cannot read stopwords.txt \n");
}
// Hash set for O(1) membership tests. Keys are lower-cased so the stopword
// check is case-insensitive, consistent with the dictionary lookup.
$stopword_set = array_flip(array_map('strtolower', array_map('trim', $stopword_lines)));
unset($stopword_set['']);

//Load Dict from CSV
echo "[+] Load Wordwise Dict \n";
$dict_handle = fopen('wordwise-dict.csv', 'r');
if ($dict_handle === false) {
    die("[!] Cannot read wordwise-dict.csv \n");
}
// fgetcsv() copes with CRLF line endings and quoted multi-line fields, which
// a plain explode("\n") does not.
$headers = fgetcsv($dict_handle, 0, ',', '"', '\\');
if (!is_array($headers) || $headers === array(null)) {
    fclose($dict_handle);
    die("[!] wordwise-dict.csv has no header row \n");
}
$headers = array_map('trim', $headers);
$headers[0] = preg_replace('/^\xEF\xBB\xBF/', '', $headers[0]); // drop a UTF-8 BOM
$columns = array_flip($headers);
if (!isset($columns['word'], $columns['short_def'])) {
    fclose($dict_handle);
    die("[!] wordwise-dict.csv must have 'word' and 'short_def' columns \n");
}
$word_col = $columns['word'];
$def_col = $columns['short_def'];
$level_col = isset($columns['hint_level']) ? $columns['hint_level'] : null;

// Dictionary keyed by the word itself: one hash lookup per book token instead
// of rebuilding and scanning a column for every token.
$wordwise_dict = array();
while (($fields = fgetcsv($dict_handle, 0, ',', '"', '\\')) !== false) {
    if (!isset($fields[$word_col], $fields[$def_col])) {
        continue; // blank or truncated row
    }
    $dict_word = trim($fields[$word_col]);
    $short_def = $fields[$def_col];
    if ($dict_word === '' || $short_def === '') {
        continue;
    }
    if (isset($wordwise_dict[$dict_word])) {
        continue; // first occurrence wins, like a linear search would
    }
    $wordwise_dict[$dict_word] = array(
        'word' => $dict_word,
        'short_def' => $short_def,
        // A missing level counts as 0, i.e. the hint is always shown.
        'hint_level' => ($level_col !== null && isset($fields[$level_col])) ? (int) $fields[$level_col] : 0,
    );
}
fclose($dict_handle);

//clean temp
echo "[+] Clean old temps \n";
if (file_exists($dump_archive)) {
    unlink($dump_archive);
}
if (is_dir($dump_dir)) {
    deleteDir($dump_dir);
} elseif (file_exists($dump_dir)) {
    unlink($dump_dir);
}

//Convert Book to HTML
echo "[+] Convert Book to HTML \n";
$convert($bookfile, $dump_archive);
$convert($dump_archive, $dump_dir);
if (!file_exists($dump_index)) {
    die('Please check did you installed Calibre ? Can you run command ebook-convert in shell ? I cannot access command ebook-convert in your system shell, This script need Calibre to process ebook texts');
}

//Get content
echo "[+] Load Book Contents \n";
$bookcontent = file_get_contents($dump_index);
if ($bookcontent === false) {
    die("[!] Cannot read $dump_index \n");
}
$bookcontent_arr = explode(" ", $bookcontent);
$token_count = count($bookcontent_arr);

//Process Word
echo "[+] Process (".$token_count.") Words \n";
sleep(5); // give the user a moment to read the summary before the log scrolls
for ($i = 0; $i < $token_count; $i++) {
    $token = $bookcontent_arr[$i];
    if ($token === '') {
        continue;
    }
    $word = cleanword($token);
    if ($word === '') {
        continue; // token was markup or punctuation only
    }
    $lookup = strtolower($word);

    //check is stopword ?
    if (isset($stopword_set[$lookup])) {
        continue; //SKIP
    }
    //Search Word in Wordwise Dict
    if (!isset($wordwise_dict[$lookup])) {
        continue;
    }
    $wordwise = $wordwise_dict[$lookup];
    //Check hint_level of current matched word
    if ($wordwise['hint_level'] > $hint_level) {
        continue; //SKIP all higher hint_level word
    }

    echo "[>>] Processing Word: $i \n";
    echo "[#] bookcontent_arr[$i]: ".$token." \n";

    //Replace Original Word with Wordwised
    // The definition is HTML-escaped and inserted through a callback so that
    // characters such as "<", "&", "$1" or "\1" in it are taken literally.
    $ruby_def = htmlspecialchars($wordwise['short_def'], ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    $annotate = function ($match) use ($ruby_def) {
        return '<ruby>'.$match[1].'<rt>'.$ruby_def.'</rt></ruby>';
    };
    $quoted_word = preg_quote($word, '/');
    $replaced = preg_replace_callback('/('.$quoted_word.')/iu', $annotate, $token);
    if ($replaced === null) {
        // Token is not valid UTF-8: retry in byte mode.
        $replaced = preg_replace_callback('/('.$quoted_word.')/i', $annotate, $token);
    }
    if ($replaced !== null) {
        $bookcontent_arr[$i] = $replaced;
    }

    echo "[#] word: ".$word." \n";
    echo "[#] bookcontent_arr REPLACED: ".$bookcontent_arr[$i]." \n";
}

//Create new book with Wordwised
echo "[+] Create New Book with Wordwised \n";
$new_bookcontent_with_wordwised = implode(' ', $bookcontent_arr);
if (file_put_contents($dump_index, $new_bookcontent_with_wordwised) === false) {
    die("[!] Cannot write $dump_index \n");
}
foreach (array('epub', 'azw3', 'pdf') as $format) {
    $convert($dump_index, $bookpath.'/'.$bookfilename.'-wordwised.'.$format);
}
echo "[+] 3 book EPUB, AZW3, PDF with wordwise generated Done !\n";
/**
 * Recursively delete a directory together with everything inside it.
 *
 * Entries are listed with scandir() rather than glob(), so hidden files
 * (names starting with a dot) are removed too and glob metacharacters in
 * $dirPath are not interpreted. A symbolic link found inside the tree is
 * removed as a link; the directory it points to is never descended into.
 *
 * @param string $dirPath Directory to remove; a trailing "/" is optional.
 * @return void
 * @throws InvalidArgumentException When $dirPath is not a directory.
 */
function deleteDir($dirPath) {
    if (! is_dir($dirPath)) {
        throw new InvalidArgumentException("$dirPath must be a directory");
    }
    if (substr($dirPath, -1) !== '/') {
        $dirPath .= '/';
    }

    $entries = scandir($dirPath);
    if ($entries === false) {
        // scandir() has already raised a warning; rmdir() below reports the rest.
        $entries = array();
    }

    foreach ($entries as $entry) {
        if ($entry === '.' || $entry === '..') {
            continue;
        }
        $path = $dirPath . $entry;
        if (is_link($path)) {
            // On Windows a link to a directory has to be removed with rmdir().
            if (DIRECTORY_SEPARATOR === '\\' && is_dir($path)) {
                rmdir($path);
            } else {
                unlink($path);
            }
        } elseif (is_dir($path)) {
            deleteDir($path);
        } else {
            unlink($path);
        }
    }

    rmdir($dirPath);
}
/**
 * Reduce a raw token taken from the book HTML to the bare word used for lookups.
 *
 * HTML tags are stripped, character entities are decoded, and then everything
 * except letters, digits, underscores and spaces is removed. Letter case is
 * preserved.
 *
 * The filter is Unicode-aware: a byte-wise \w would delete accented letters
 * and turn e.g. "naïve" into the unrelated word "nave". Input that is not
 * valid UTF-8 falls back to the byte-wise filter.
 *
 * @param string $word Raw token, possibly containing markup and punctuation.
 * @return string The cleaned word; empty when nothing word-like remains.
 */
function cleanword($word){
    $word = strip_tags($word); //strip html tags
    // Decode entities first so "word&nbsp;" yields "word" instead of "wordnbsp".
    $word = html_entity_decode($word, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    // One pass removes every punctuation mark and symbol, ASCII or typographic.
    $clean = preg_replace('/[^ \p{L}\p{N}_]+/u', '', $word);
    if ($clean === null) {
        // preg_replace() yields null when the subject is not valid UTF-8.
        $clean = preg_replace('/[^ \w]+/', '', $word);
    }
    return (string) $clean;
}
?>