-
Notifications
You must be signed in to change notification settings - Fork 4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
51 additions
and
112 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
*.binary filter=lfs diff=lfs merge=lfs -crlf | ||
data/lm/trie filter=lfs diff=lfs merge=lfs -crlf | ||
data/lm/vocab.txt filter=lfs diff=lfs merge=lfs -text | ||
data/lm/trie.ctcdecode filter=lfs diff=lfs merge=lfs -text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Git LFS file not shown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,64 +1,25 @@ | ||
#include <algorithm> | ||
#include <iostream> | ||
#include <string> | ||
using namespace std; | ||
|
||
#include "lm/model.hh" | ||
#include "trie_node.h" | ||
#include "ctcdecode/scorer.h" | ||
#include "fst/fstlib.h" | ||
#include "alphabet.h" | ||
|
||
typedef lm::ngram::QuantArrayTrieModel Model; | ||
|
||
// Looks up `word` in the vocabulary exposed by `model` and returns the | ||
// lm::WordIndex that the vocabulary reports for it. | ||
// NOTE(review): the result for a word missing from the vocabulary is decided | ||
// entirely by Index(); presumably the unknown-word index - confirm in KenLM. | ||
lm::WordIndex GetWordIndex(const Model& model, const std::string& word) { | ||
return model.GetVocabulary().Index(word); | ||
} | ||
|
||
// Scores a single word on its own: the word is evaluated starting from the | ||
// model's null-context state, and the `prob` field of the FullScore result is | ||
// returned. | ||
// NOTE(review): `prob` is presumably a log10 probability per KenLM - confirm. | ||
float ScoreWord(const Model& model, lm::WordIndex word_index) { | ||
// We don't need to keep state here as we're scoring the words individually. | ||
// `out` only exists because FullScore requires an output state; it is discarded. | ||
Model::State out; | ||
return model.FullScore(model.NullContextState(), word_index, out).prob; | ||
} | ||
|
||
int generate_trie(const char* alphabet_path, const char* kenlm_path, const char* vocab_path, const char* trie_path) { | ||
Alphabet a(alphabet_path); | ||
|
||
lm::ngram::Config config; | ||
config.load_method = util::POPULATE_OR_READ; | ||
Model model(kenlm_path, config); | ||
TrieNode root(a.GetSize()); | ||
|
||
std::ifstream ifs(vocab_path, std::ifstream::in | std::ios::binary); | ||
if (!ifs) { | ||
std::cerr << "unable to open vocabulary file " << vocab_path << std::endl; | ||
return -1; | ||
} | ||
|
||
std::ofstream ofs(trie_path); | ||
if (!ofs) { | ||
std::cerr << "unable to open output file " << trie_path << std::endl; | ||
return -1; | ||
} | ||
|
||
std::string word; | ||
while (ifs >> word) { | ||
lm::WordIndex word_index = GetWordIndex(model, word); | ||
float unigram_score = ScoreWord(model, word_index); | ||
root.Insert(word, | ||
[&a](const std::string& c) { | ||
return a.LabelFromString(c); | ||
}, | ||
word_index, unigram_score); | ||
} | ||
using namespace std; | ||
|
||
root.WriteToStream(ofs); | ||
// Constructs an Alphabet from `alphabet_path`, builds a Scorer from it together | ||
// with the language model at `kenlm_path`, and asks the Scorer to save its | ||
// dictionary to `trie_path`. | ||
// Always returns 0; this function does not report failure through its return | ||
// value. | ||
int generate_trie(const char* alphabet_path, const char* kenlm_path, const char* trie_path) { | ||
Alphabet alphabet(alphabet_path); | ||
// NOTE(review): the two 0.0 arguments are presumably scorer weights that do | ||
// not matter for dictionary generation, and the empty string presumably means | ||
// "no existing trie to load" - confirm against the Scorer constructor. | ||
Scorer scorer(0.0, 0.0, kenlm_path, "", alphabet); | ||
scorer.save_dictionary(trie_path); | ||
return 0; | ||
} | ||
|
||
int main(int argc, char** argv) { | ||
if (argc != 5) { | ||
std::cerr << "Usage: " << argv[0] << " <alphabet> <lm_model> <vocabulary> <trie_path>" << std::endl; | ||
if (argc != 4) { | ||
std::cerr << "Usage: " << argv[0] << " <alphabet> <lm_model> <trie_path>" << std::endl; | ||
return -1; | ||
} | ||
|
||
return generate_trie(argv[1], argv[2], argv[3], argv[4]); | ||
return generate_trie(argv[1], argv[2], argv[3]); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,24 @@ | ||
#include <algorithm> | ||
#include <iostream> | ||
#include <memory> | ||
#include <string> | ||
|
||
#include "ctcdecode/scorer.h" | ||
#include "fst/fstlib.h" | ||
#include "alphabet.h" | ||
#include "trie_node.h" | ||
|
||
using namespace std; | ||
|
||
|
||
int main(int argc, char** argv) | ||
{ | ||
const char* trie_path = argv[1]; | ||
const char* alphabet_path = argv[2]; | ||
const char* kenlm_path = argv[1]; | ||
const char* trie_path = argv[2]; | ||
const char* alphabet_path = argv[3]; | ||
|
||
printf("Loading trie(%s) and alphabet(%s)\n", trie_path, alphabet_path); | ||
|
||
Alphabet alphabet_ = Alphabet(alphabet_path); | ||
TrieNode *trieRoot_; | ||
|
||
std::ifstream in(trie_path, std::ios::in | std::ios::binary); | ||
TrieNode::ReadFromStream(in, trieRoot_, alphabet_.GetSize()); | ||
Alphabet alphabet(alphabet_path); | ||
Scorer scorer(0.0, 0.0, kenlm_path, trie_path, alphabet); | ||
|
||
return 0; | ||
} |