-
Notifications
You must be signed in to change notification settings - Fork 9.4k
/
unicharset_extractor.cpp
121 lines (112 loc) · 4.57 KB
/
unicharset_extractor.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
///////////////////////////////////////////////////////////////////////
// File: unicharset_extractor.cpp
// Description: Unicode character/ligature set extractor.
// Author: Thomas Kielbus
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
// Given a list of box files or text files on the command line, this program
// normalizes the text according to command-line options and generates
// a unicharset.
#include <cstdlib>
#include <filesystem>
#include "boxread.h"
#include "commandlineflags.h"
#include "commontraining.h" // CheckSharedLibraryVersion
#include "lang_model_helpers.h"
#include "normstrngs.h"
#include "unicharset.h"
#include "unicharset_training_utils.h"
using namespace tesseract;
// Command-line flag --output_unicharset: path handed to
// UNICHARSET::save_to_file when the result is written.
// NOTE(review): the second macro argument ("unicharset") is presumably the
// default value -- confirm against commandlineflags.h.
static STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path");
// Command-line flag --norm_mode: integer forwarded to AddStringsToUnicharset,
// where it is cast to GraphemeNormMode. The help text lists the accepted
// values; the value is not range-checked in this file.
static INT_PARAM_FLAG(norm_mode, 1,
"Normalization mode: 1=Combine graphemes, "
"2=Split graphemes, 3=Pure unicode");
namespace tesseract {
// Feeds every entry of `strings` through NormalizeCleanAndSegmentUTF8 (NFC,
// no OCR normalization, grapheme handling selected by `norm_mode`) and inserts
// each resulting segment into `*unicharset`.
//   strings    - input texts to normalize and segment.
//   norm_mode  - integer cast to GraphemeNormMode without validation.
//   unicharset - destination set; must not be null (it is dereferenced).
// Segments that are empty or whitespace are not inserted. An entry that fails
// normalization is reported through tprintf and skipped; processing continues
// with the next entry.
static void AddStringsToUnicharset(const std::vector<std::string> &strings, int norm_mode,
                                   UNICHARSET *unicharset) {
  const auto grapheme_mode = static_cast<GraphemeNormMode>(norm_mode);
  for (const std::string &text : strings) {
    std::vector<std::string> segments;
    const bool normalized_ok =
        NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, grapheme_mode,
                                     /*report_errors*/ true, text.c_str(), &segments);
    if (!normalized_ok) {
      tprintf("Normalization failed for string '%s'\n", text.c_str());
      continue;
    }
    for (const std::string &segment : segments) {
      // Each segment is a UTF-8 encoded string; blanks never become unichars.
      const bool is_blank = segment.empty() || IsUTF8Whitespace(segment.c_str());
      if (!is_blank) {
        unicharset->unichar_insert(segment.c_str());
      }
    }
  }
}
// Builds a unicharset from the input files named in argv[1..argc-1] and saves
// it to the path held in FLAGS_output_unicharset.
//   - A file whose extension is ".box" is parsed with ReadMemBoxes.
//   - Any other file is treated as plain text and split on '\n'.
//   - A file for which ReadFile returns an empty string is skipped.
// After all files are processed, SetupBasicProperties is applied to the set.
// Returns EXIT_SUCCESS on success, or EXIT_FAILURE when a box file cannot be
// parsed or the unicharset cannot be saved.
static int Main(int argc, char **argv) {
  UNICHARSET unicharset;
  // Accumulate the contents of every input file into the one unicharset.
  for (int arg = 1; arg < argc; ++arg) {
    const char *input_name = argv[arg];
    std::filesystem::path input_path = input_name;
    std::string contents = tesseract::ReadFile(input_name);
    if (contents.empty()) {
      continue;
    }
    const bool is_box_file = (input_path.extension() == ".box");
    std::vector<std::string> entries;
    if (!is_box_file) {
      tprintf("Extracting unicharset from plain text file %s\n", input_name);
      entries = split(contents, '\n');
    } else {
      tprintf("Extracting unicharset from box file %s\n", input_name);
      // Only the texts are collected; boxes, box_texts and pages are not requested.
      const bool parsed = ReadMemBoxes(-1, /*skip_blanks*/ true, contents.data(),
                                       /*continue_on_failure*/ false, /*boxes*/ nullptr, &entries,
                                       /*box_texts*/ nullptr, /*pages*/ nullptr);
      if (!parsed) {
        tprintf("Cannot read box data from '%s'\n", input_name);
        return EXIT_FAILURE;
      }
    }
    AddStringsToUnicharset(entries, FLAGS_norm_mode, &unicharset);
  }
  SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false, &unicharset);
  // Persist the result; a failed save is the only error reported at this stage.
  if (!unicharset.save_to_file(FLAGS_output_unicharset.c_str())) {
    tprintf("Cannot save unicharset file %s\n", FLAGS_output_unicharset.c_str());
    return EXIT_FAILURE;
  }
  tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
  return EXIT_SUCCESS;
}
} // namespace tesseract
// Program entry point.
// Checks the shared library version, parses command-line flags when at least
// one argument was supplied, then hands the remaining arguments to
// tesseract::Main and returns its result.
// If fewer than two arguments remain, prints the usage text and returns
// EXIT_FAILURE.
// NOTE(review): ParseCommandLineFlags receives &argc/&argv, so it presumably
// strips the flags it consumes -- confirm against commandlineflags.h.
int main(int argc, char **argv) {
  tesseract::CheckSharedLibraryVersion();
  if (argc > 1) {
    tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
  }
  if (argc >= 2) {
    return tesseract::Main(argc, argv);
  }
  // No input file given: explain how to invoke the tool.
  tprintf(
      "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
      " box_or_text_file [...]\n",
      argv[0]);
  tprintf("Where mode means:\n");
  tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
  tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
  tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
  tprintf("Reads box or plain text files to extract the unicharset.\n");
  return EXIT_FAILURE;
}