-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathDicGlobals.pm
150 lines (126 loc) · 8.79 KB
/
DicGlobals.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/perl
package DicGlobals;
use strict;
use warnings;
use Exporter;
# Inherit import() from Exporter, so that a plain `use DicGlobals;` imports
# every name in @EXPORT into the calling package.
our @ISA = qw(Exporter);

# Names exported by default: the scalars first (sorted case-insensitively),
# followed by the hash and the arrays.
our @EXPORT = qw(
    $BaseDir
    $cycle_dotprinter
    $cycles_per_dot
    $DocType
    $DoNotFilterDocType
    $DumperSuffix
    $FileName
    $ForceConvertBlockquote2Div
    $ForceConvertNumberedSequencesToChar
    $FullPath
    $isCodeImageBase64
    $isConvertColorNamestoHexCodePoints
    $isCreateMDict
    $isCreatePocketbookDictionary
    $isCreateStardictDictionary
    $isHandleMobiDictionary
    $isTestingOn
    $KindleUnpackLibFolder
    $lastline_xdxf
    $lastline_xml
    $LocalPath
    $OperatingSystem
    $SameTypeSequence
    $updateSameTypeSequence
    $UseXMLTidy
    %EntityConversion
    @CleanHTMLTags
    @ExcludedHTMLTags
    @xdxf_start
    @xml_start
);
###########################################
### Beginning of manual control input ###
###########################################
# Dictionary file to convert, given relative to the base directory $BaseDir.
# Several candidates may be listed below: every assignment overwrites the
# previous one, so the last filename is the one that is used.
# However, when an argument is given, it will supersede the last filename.
our $FileName;
# Examples given:
$FileName = "dict/Oxford English Dictionary 2nd Ed/Oxford English Dictionary 2nd Ed.xdxf";
$FileName = "dict/stardict-Webster_s_Unabridged_3-2.4.2/Webster_s_Unabridged_3.ifo";

# $BaseDir is the directory where converter.exe and the language folders reside.
# Typically the language folders are named by two letters, e.g. english is named 'en'.
# Each folder should contain a collates.txt, keyboard.txt and morphems.txt file.
our $BaseDir = "/home/mark/Downloads/PocketbookDic";

# Default values derived from the settings above.
# $LocalPath is the directory part of $FileName (everything before the last
# '/'); it is the empty string when the pattern does not match.
our $LocalPath = ( $FileName =~ m{^(.+?)/[^/]+$} ) ? $1 : '';
our $FullPath  = join '/', $BaseDir, $LocalPath;

# $KindleUnpackLibFolder is the folder in which kindleunpack.py resides.
# You can download KindleUnpack using http with: git clone https://github.com/kevinhendricks/KindleUnpack
# or using ssh with: git clone git@github.com:kevinhendricks/KindleUnpack.git
# Use an absolute path beginning with either '/' (root) or '~' (home) on Linux.
# On Windows use whatever works.
our $KindleUnpackLibFolder = "/home/mark/git/KindleUnpack/lib";

# Has to be declared before any call to storeHash or retrieveHash.
# Otherwise it is undefined, although no error is given.
our $DumperSuffix = ".Dumper.txt";
# Controls for debugging.
our $isTestingOn = 0;    # Toggles intermediary output of xdxf-array.
# Progress indicator: a green dot is printed after $cycles_per_dot ar's
# (presumably xdxf <ar> entries -- confirm) have been processed.
our $cycle_dotprinter = 0;
our $cycles_per_dot   = 300;

# Controls for Stardict dictionary creation and Koreader stardict compatibility.
our $isCreateStardictDictionary = 0;    # Turns on Stardict text and binary dictionary creation.
# Same Type Sequence is the initial value of the Stardict variable set in the ifo-file.
# "h" means html-dictionary. "m" means text.
# If set at "m", the xdxf-file will be filtered for numeric character values
# and converted to unicode. (NOTE(review): the original wording was garbled
# here; presumably "&#...;" sequences are meant -- confirm.)
our $SameTypeSequence = "h";            # Either "h" or "m" or "x".
# If the Stardict files give a sametypesequence value, update the initial value.
our $updateSameTypeSequence = 1;
our $isConvertColorNamestoHexCodePoints = 1;    # Converting takes time.

# Controls for Pocketbook conversion.
our $isCreatePocketbookDictionary = 1;  # Controls conversion to Pocketbook Dictionary dic-format.
# Force conversion of numbered sequences to characters.
our $ForceConvertNumberedSequencesToChar = 1;
# Nouveau Littré uses doctype symbols, which should be converted before further processing.
# When this switch is true, $DocType is reset to the empty string further down in this file.
our $DoNotFilterDocType = 1;

# Controls for Mobi dictionary handling.
our $isHandleMobiDictionary = 1;

# Create mdict dictionary.
our $isCreateMDict = 0;

# Controls for recoding or deleting images and sounds.
# Some dictionaries contain images. Encoding them as Base64 allows coding them
# inline. Only implemented with convertHTML2XDXF.
our $isCodeImageBase64 = 0;
our $ForceConvertBlockquote2Div = 0;
# Enables or disables the use of the subroutine tidyXMLArray.
# Still experimental, so disable.
our $UseXMLTidy = 0;
#########################################################
### End of manual control input ####
### (Excluding doctype html entities. See below. ) ####
#########################################################
# Determine operating system ($^O holds the OS name perl was built for).
our $OperatingSystem = $^O;

# As NouveauLittre showed a rather big problem with named entities, a special
# filter was written for them. Here is the place to insert your DOCTYPE string.
#
# Quote the string with q{...} and finish the statement with a semicolon.
# Plain '...' quoting must NOT be used: the entity table contains an
# apostrophe (<!ENTITY h2 "'">), which would terminate a single-quoted string
# early and stop this module from compiling. Inside q{...} only unbalanced
# curly braces need escaping; the string below contains none.
#
# The last $DocType assignment will be used.
# To omit the filter place an empty DocType string at the end:
# $DocType = '';
#
# NOTE(review): the entity values are literal non-ASCII characters and this
# file does not say `use utf8`, so they are presumably stored as raw bytes --
# confirm that the code consuming $DocType expects that.
our ( $DocType, %EntityConversion );    # %EntityConversion is only declared here, not filled.
$DocType = q{<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[<!ENTITY ns "♦"><!ENTITY os "•"><!ENTITY oo "›"><!ENTITY co "‹"><!ENTITY a "a"><!ENTITY â "ɑ"><!ENTITY an "ɑ̃"><!ENTITY b "b"><!ENTITY d "ɗ"><!ENTITY e "ə"><!ENTITY é "e"><!ENTITY è "ɛ"><!ENTITY in "ɛ̃"><!ENTITY f "f"><!ENTITY g "ɡ"><!ENTITY h "h"><!ENTITY h2 "'"><!ENTITY i "i"><!ENTITY j "J"><!ENTITY k "k"><!ENTITY l "l"><!ENTITY m "m"><!ENTITY n "n"><!ENTITY gn "ɲ"><!ENTITY ing "ɳ"><!ENTITY o "o"><!ENTITY o2 "ɔ"><!ENTITY oe "ɶ"><!ENTITY on "ɔ̃"><!ENTITY eu "ɸ"><!ENTITY un "ɶ̃"><!ENTITY p "p"><!ENTITY r "ʀ"><!ENTITY s "s"><!ENTITY ch "ʃ"><!ENTITY t "t"><!ENTITY u "ɥ"><!ENTITY ou "u"><!ENTITY v "v"><!ENTITY w "w"><!ENTITY x "x"><!ENTITY y "y"><!ENTITY z "z"><!ENTITY Z "ʒ">]><html xml:lang="fr" xmlns="http://www.w3.org/1999/xhtml"><head><title></title></head><body>};

# The manual switch $DoNotFilterDocType overrides whatever was set above.
if ( $DoNotFilterDocType ) { $DocType = ''; }
# HTML tag names, each written with its angle brackets, in alphabetical order.
# How the list is consumed is decided by the importing script.
our @CleanHTMLTags = qw(
    <!--...-->   <!DOCTYPE>   <a>          <abbr>       <acronym>    <address>
    <applet>     <area>       <aside>      <audio>      <b>          <base>
    <basefont>   <bdi>        <bdo>        <big>        <blockquote> <body>
    <br>         <button>     <canvas>     <caption>    <center>     <cite>
    <code>       <col>        <colgroup>   <data>       <datalist>   <dd>
    <del>        <details>    <dfn>        <dialog>     <dir>        <div>
    <dl>         <dt>         <em>         <embed>      <fieldset>   <figcaption>
    <figure>     <font>       <footer>     <form>       <frame>      <frameset>
    <h1>         <header>     <hr>         <html>       <i>          <iframe>
    <img>        <input>      <ins>        <kbd>        <label>      <legend>
    <li>         <link>       <main>       <map>        <mark>       <meta>
    <meter>      <nav>        <noframes>   <noscript>   <object>     <ol>
    <optgroup>   <option>     <output>     <p>          <param>      <picture>
    <pre>        <progress>   <q>          <rp>         <rt>         <ruby>
    <s>          <samp>       <script>     <section>    <select>     <small>
    <source>     <span>       <strike>     <strong>     <style>      <sub>
    <summary>    <sup>        <svg>        <table>      <tbody>      <td>
    <template>   <textarea>   <tfoot>      <th>         <thead>      <time>
    <title>      <tr>         <track>      <tt>         <u>          <ul>
    <var>        <video>      <wbr>
);

# Tags kept on a separate list; neither of them appears in @CleanHTMLTags.
our @ExcludedHTMLTags = qw( <head> <article> );
# Opening lines of an xdxf document. Every element ends with a newline, which
# is appended here once instead of being spelled out per line.
our @xdxf_start = map { $_ . "\n" } (
    '<?xml version="1.0" encoding="UTF-8" ?>',
    '<xdxf lang_from="" lang_to="" format="visual">',
    '<full_name></full_name>',
    '<description>',
    '<date></date>',
    'Created with pocketbookdic.pl',
    '</description>',
);

# Closing line that matches the <xdxf ...> element opened above.
our $lastline_xdxf = '</xdxf>' . "\n";
# Opening lines of a stardict xml document. Every element ends with a newline,
# appended once by the map. The index of each element is noted on the right;
# presumably other code addresses the elements by position, so keep the order.
# The <date> value is the GMT time string at the moment this module is loaded.
our @xml_start = map { $_ . "\n" } (
    '<?xml version="1.0" encoding="UTF-8" ?>',                          #[0]
    '<stardict xmlns:xi="http://www.w3.org/2003/XInclude">',            #[1]
    '<info>',                                                           #[2]
    '<version>2.4.2</version>',                                         #[3]
    '<bookname></bookname>',                                            #[4]
    '<author>pocketbookdic.pl</author>',                                #[5]
    '<email>rather_open_issue@github.com</email>',                      #[6]
    '<website>https://github.com/Markismus/PocketBookDic</website>',    #[7]
    '<description></description>',                                      #[8]
    '<date>' . scalar( gmtime() ) . '</date>',                          #[9]
    # '<dicttype></dicttype>',
    '</info>',                                                          #[10]
);

# Closing line that matches the <stardict ...> element opened above.
our $lastline_xml = '</stardict>' . "\n";
1;