From 0e458e9e820835884ade573a3cd4c69982b25a2a Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Sun, 26 Jul 2015 12:39:38 +0300 Subject: [PATCH 1/8] Add missing space in message --- parser/lstm-parse.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index 44b95a8..1eb0548 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -531,7 +531,7 @@ int main(int argc, char** argv) { if (conf.count("words")) { pretrained[kUNK] = vector(PRETRAINED_DIM, 0); - cerr << "Loading from " << conf["words"].as() << " with" << PRETRAINED_DIM << " dimensions\n"; + cerr << "Loading from " << conf["words"].as() << " with " << PRETRAINED_DIM << " dimensions\n"; ifstream in(conf["words"].as().c_str()); string line; getline(in, line); From fef8696c75db00bb09d6fbaa53b54cda7c360522 Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Tue, 28 Jul 2015 14:02:32 +0300 Subject: [PATCH 2/8] Fix clab/lstm-parser#4: support reading zipped word vectors --- CMakeLists.txt | 2 +- parser/lstm-parse.cc | 48 ++++++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1687a20..12ac42e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ if(DEFINED ENV{BOOST_ROOT}) set(Boost_NO_SYSTEM_PATHS ON) endif() set(Boost_REALPATH ON) -find_package(Boost COMPONENTS program_options serialization REQUIRED) +find_package(Boost COMPONENTS program_options serialization iostreams REQUIRED) include_directories(${Boost_INCLUDE_DIR}) set(LIBS ${LIBS} ${Boost_LIBRARIES}) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index 1eb0548..53a77d9 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -15,8 +15,11 @@ #include #include +#include #include #include +#include +#include #include #include "cnn/training.h" @@ -481,6 +484,21 @@ void output_conll(const vector& sentence, const vector& pos, cout << endl; } +void init_pretrained(istream &in) { + string line; + vector v(PRETRAINED_DIM, 0); + string word; + while (getline(in, line)) { + if (word.empty() && line.find('.') == std::string::npos) + continue; // first line contains vocabulary size and dimensions + istringstream lin(line); + lin >> word; + for (unsigned i = 0; i < PRETRAINED_DIM; ++i) lin >> v[i]; + unsigned id = corpus.get_or_add_word(word); + pretrained[id] = v; + } +} + int main(int argc, char** argv) { cnn::Initialize(argc, argv); @@ -525,24 +543,24 @@ int main(int argc, char** argv) { const string fname = os.str(); cerr << "Writing parameters to file: " << fname << endl; bool softlinkCreated = false; - corpus.load_correct_actions(conf["training_data"].as()); + corpus.load_correct_actions(conf["training_data"].as()); const unsigned kUNK = corpus.get_or_add_word(cpyp::Corpus::UNK); kROOT_SYMBOL = corpus.get_or_add_word(ROOT_SYMBOL); if (conf.count("words")) { pretrained[kUNK] = vector(PRETRAINED_DIM, 0); - cerr << "Loading from " << conf["words"].as() << " with " << PRETRAINED_DIM << " dimensions\n"; - ifstream in(conf["words"].as().c_str()); - string line; - getline(in, line); - vector v(PRETRAINED_DIM, 0); - string word; - while (getline(in, line)) { - istringstream lin(line); - lin >> word; - for (unsigned i = 0; i < PRETRAINED_DIM; ++i) lin >> v[i]; - unsigned id = corpus.get_or_add_word(word); - pretrained[id] = v; + const string& words_fname = conf["words"].as(); + cerr << "Loading from " << words_fname << " with " << PRETRAINED_DIM << " dimensions\n"; + if (boost::algorithm::ends_with(words_fname, ".gz")) { + ifstream file(words_fname.c_str(), ios_base::in | ios_base::binary); + boost::iostreams::filtering_streambuf zip; + zip.push(boost::iostreams::zlib_decompressor()); + zip.push(file); + istream in(&zip); + init_pretrained(in); + } else { + ifstream in(words_fname.c_str()); + init_pretrained(in); // read as normal text } } @@ -611,7 +629,7 @@ int main(int argc, char** argv) { for (auto& w : tsentence) if (singletons.count(w) && cnn::rand01() < unk_prob) w = kUNK; } - const vector& sentencePos=corpus.sentencesPos[order[si]]; + const vector& sentencePos=corpus.sentencesPos[order[si]]; const vector& actions=corpus.correct_act_sent[order[si]]; ComputationGraph hg; parser.log_prob_parser(&hg,sentence,tsentence,sentencePos,actions,corpus.actions,corpus.intToWords,&right); @@ -644,7 +662,7 @@ int main(int argc, char** argv) { auto t_start = std::chrono::high_resolution_clock::now(); for (unsigned sii = 0; sii < dev_size; ++sii) { const vector& sentence=corpus.sentencesDev[sii]; - const vector& sentencePos=corpus.sentencesPosDev[sii]; + const vector& sentencePos=corpus.sentencesPosDev[sii]; const vector& actions=corpus.correct_act_sentDev[sii]; vector tsentence=sentence; for (auto& w : tsentence) From db29f8557353428f79a84304384d0ebddc89bedc Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Tue, 28 Jul 2015 14:07:03 +0300 Subject: [PATCH 3/8] Remove unused imports --- parser/lstm-parse.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index 53a77d9..5b4f695 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -2,16 +2,11 @@ #include #include #include -#include -#include -#include #include -#include #include #include -#include #include #include @@ -25,9 +20,7 @@ #include "cnn/training.h" #include "cnn/cnn.h" #include "cnn/expr.h" -#include "cnn/nodes.h" #include "cnn/lstm.h" -#include "cnn/rnn.h" #include "c2.h" cpyp::Corpus corpus; From f3b4cca6e7a302c308a10cdd089f68da067a543a Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Tue, 28 Jul 2015 14:08:28 +0300 Subject: [PATCH 4/8] Fix indentation and trailing whitespace --- parser/lstm-parse.cc | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index 5b4f695..f30651b 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -439,19 +439,19 @@ unsigned compute_correct(const map& ref, const map& hyp, unsig } void output_conll(const vector& sentence, const vector& pos, - const vector& sentenceUnkStrings, - const map& intToWords, - const map& intToPos, + const vector& sentenceUnkStrings, + const map& intToWords, + const map& intToPos, const map& hyp, const map& rel_hyp) { for (unsigned i = 0; i < (sentence.size()-1); ++i) { auto index = i + 1; - assert(i < sentenceUnkStrings.size() && + assert(i < sentenceUnkStrings.size() && ((sentence[i] == corpus.get_or_add_word(cpyp::Corpus::UNK) && sentenceUnkStrings[i].size() > 0) || (sentence[i] != corpus.get_or_add_word(cpyp::Corpus::UNK) && sentenceUnkStrings[i].size() == 0 && intToWords.find(sentence[i]) != intToWords.end()))); - string wit = (sentenceUnkStrings[i].size() > 0)? + string wit = (sentenceUnkStrings[i].size() > 0)? sentenceUnkStrings[i] : intToWords.find(sentence[i])->second; auto pit = intToPos.find(pos[i]); assert(hyp.find(i) != hyp.end()); @@ -463,10 +463,10 @@ void output_conll(const vector& sentence, const vector& pos, size_t first_char_in_rel = hyp_rel.find('(') + 1; size_t last_char_in_rel = hyp_rel.rfind(')') - 1; hyp_rel = hyp_rel.substr(first_char_in_rel, last_char_in_rel - first_char_in_rel + 1); - cout << index << '\t' // 1. ID + cout << index << '\t' // 1. ID << wit << '\t' // 2. FORM - << "_" << '\t' // 3. LEMMA - << "_" << '\t' // 4. CPOSTAG + << "_" << '\t' // 3. LEMMA + << "_" << '\t' // 4. CPOSTAG << pit->second << '\t' // 5. POSTAG << "_" << '\t' // 6. FEATS << hyp_head << '\t' // 7. HEAD @@ -496,7 +496,7 @@ void init_pretrained(istream &in) { int main(int argc, char** argv) { cnn::Initialize(argc, argv); - cerr << "COMMAND:"; + cerr << "COMMAND:"; for (unsigned i = 0; i < static_cast(argc); ++i) cerr << ' ' << argv[i]; cerr << endl; unsigned status_every_i_iterations = 100; @@ -622,8 +622,8 @@ int main(int argc, char** argv) { for (auto& w : tsentence) if (singletons.count(w) && cnn::rand01() < unk_prob) w = kUNK; } - const vector& sentencePos=corpus.sentencesPos[order[si]]; - const vector& actions=corpus.correct_act_sent[order[si]]; + const vector& sentencePos=corpus.sentencesPos[order[si]]; + const vector& actions=corpus.correct_act_sent[order[si]]; ComputationGraph hg; parser.log_prob_parser(&hg,sentence,tsentence,sentencePos,actions,corpus.actions,corpus.intToWords,&right); double lp = as_scalar(hg.incremental_forward()); @@ -655,15 +655,15 @@ int main(int argc, char** argv) { auto t_start = std::chrono::high_resolution_clock::now(); for (unsigned sii = 0; sii < dev_size; ++sii) { const vector& sentence=corpus.sentencesDev[sii]; - const vector& sentencePos=corpus.sentencesPosDev[sii]; - const vector& actions=corpus.correct_act_sentDev[sii]; + const vector& sentencePos=corpus.sentencesPosDev[sii]; + const vector& actions=corpus.correct_act_sentDev[sii]; vector tsentence=sentence; for (auto& w : tsentence) if (training_vocab.count(w) == 0) w = kUNK; ComputationGraph hg; - vector pred = parser.log_prob_parser(&hg,sentence,tsentence,sentencePos,vector(),corpus.actions,corpus.intToWords,&right); - double lp = 0; + vector pred = parser.log_prob_parser(&hg,sentence,tsentence,sentencePos,vector(),corpus.actions,corpus.intToWords,&right); + double lp = 0; llh -= lp; trs += actions.size(); map ref = parser.compute_heads(sentence.size(), actions, corpus.actions); @@ -683,9 +683,9 @@ int main(int argc, char** argv) { // easier to refer to it in a shell script. if (!softlinkCreated) { string softlink = " latest_model"; - if (system((string("rm -f ") + softlink).c_str()) == 0 && + if (system((string("rm -f ") + softlink).c_str()) == 0 && system((string("ln -s ") + fname + softlink).c_str()) == 0) { - cerr << "Created " << softlink << " as a soft link to " << fname + cerr << "Created " << softlink << " as a soft link to " << fname << " for convenience." << endl; } softlinkCreated = true; @@ -704,8 +704,8 @@ int main(int argc, char** argv) { unsigned corpus_size = corpus.nsentencesDev; for (unsigned sii = 0; sii < corpus_size; ++sii) { const vector& sentence=corpus.sentencesDev[sii]; - const vector& sentencePos=corpus.sentencesPosDev[sii]; - const vector& sentenceUnkStr=corpus.sentencesStrDev[sii]; + const vector& sentencePos=corpus.sentencesPosDev[sii]; + const vector& sentenceUnkStr=corpus.sentencesStrDev[sii]; const vector& actions=corpus.correct_act_sentDev[sii]; vector tsentence=sentence; for (auto& w : tsentence) From 5f93b46db95f0e7fbb399a89483bb57050baad4b Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Tue, 28 Jul 2015 14:46:59 +0300 Subject: [PATCH 5/8] clab/lstm-parser#3: allow limiting number of iterations --- parser/lstm-parse.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index f30651b..adde5c9 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -70,6 +70,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("rel_dim", po::value()->default_value(10), "relation dimension") ("lstm_input_dim", po::value()->default_value(60), "LSTM input dimension") ("train,t", "Should training be run?") + ("maxit,M", po::value()->default_value(8000), "Maximum number of training iterations") ("words,w", po::value(), "Pretrained word embeddings") ("help,h", "Help"); po::options_description dcmdline_options; @@ -522,6 +523,8 @@ int main(int argc, char** argv) { } const double unk_prob = conf["unk_prob"].as(); assert(unk_prob >= 0.); assert(unk_prob <= 1.); + const unsigned maxit = conf["maxit"].as(); + cerr << "Maximum number of iterations: " << maxit << "\n"; ostringstream os; os << "parser_" << (USE_POS ? "pos" : "nopos") << '_' << LAYERS @@ -603,11 +606,10 @@ int main(int argc, char** argv) { double right = 0; double llh = 0; bool first = true; - int iter = -1; + unsigned iter = 0; time_t time_start = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); cerr << "TRAINING STARTED AT: " << put_time(localtime(&time_start), "%c %Z") << endl; - while(!requested_stop) { - ++iter; + while(!requested_stop && iter < maxit) { for (unsigned sii = 0; sii < status_every_i_iterations; ++sii) { if (si == corpus.nsentences) { si = 0; @@ -692,6 +694,10 @@ int main(int argc, char** argv) { } } } + ++iter; + } + if (iter >= maxit) { + cerr << "\nMaximum number of iterations reached (" << iter << "), terminating optimization...\n"; } } // should do training? if (true) { // do test evaluation From e8432811caf8472ee0934c36ff9a36fe96dd30af Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Tue, 28 Jul 2015 15:04:49 +0300 Subject: [PATCH 6/8] Fix clab/lstm-parser#3: allow limiting optimization by dev uas tolerance --- parser/lstm-parse.cc | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index adde5c9..e6d2f69 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -71,6 +71,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("lstm_input_dim", po::value()->default_value(60), "LSTM input dimension") ("train,t", "Should training be run?") ("maxit,M", po::value()->default_value(8000), "Maximum number of training iterations") + ("tolerance", po::value()->default_value(0.0), "Tolerance on dev uas for stopping training") ("words,w", po::value(), "Pretrained word embeddings") ("help,h", "Help"); po::options_description dcmdline_options; @@ -525,6 +526,8 @@ int main(int argc, char** argv) { assert(unk_prob >= 0.); assert(unk_prob <= 1.); const unsigned maxit = conf["maxit"].as(); cerr << "Maximum number of iterations: " << maxit << "\n"; + const double tolerance = conf["tolerance"].as(); + cerr << "Optimization tolerance: " << tolerance << "\n"; ostringstream os; os << "parser_" << (USE_POS ? "pos" : "nopos") << '_' << LAYERS @@ -607,9 +610,12 @@ int main(int argc, char** argv) { double llh = 0; bool first = true; unsigned iter = 0; + double uas = -1; + double prev_uas = -1; time_t time_start = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); cerr << "TRAINING STARTED AT: " << put_time(localtime(&time_start), "%c %Z") << endl; - while(!requested_stop && iter < maxit) { + while(!requested_stop && iter < maxit && + (uas < 0 || prev_uas < 0 || abs(prev_uas - uas) > tolerance)) { for (unsigned sii = 0; sii < status_every_i_iterations; ++sii) { if (si == corpus.nsentences) { si = 0; @@ -675,7 +681,9 @@ int main(int argc, char** argv) { total_heads += sentence.size() - 1; } auto t_end = std::chrono::high_resolution_clock::now(); - cerr << " **dev (iter=" << iter << " epoch=" << (tot_seen / corpus.nsentences) << ")\tllh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " uas: " << (correct_heads / total_heads) << "\t[" << dev_size << " sents in " << std::chrono::duration(t_end-t_start).count() << " ms]" << endl; + prev_uas = uas; + uas = correct_heads / total_heads; + cerr << " **dev (iter=" << iter << " epoch=" << (tot_seen / corpus.nsentences) << ")\tllh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " uas: " << uas << "\t[" << dev_size << " sents in " << std::chrono::duration(t_end-t_start).count() << " ms]" << endl; if (correct_heads > best_correct_heads) { best_correct_heads = correct_heads; ofstream out(fname); @@ -698,6 +706,8 @@ int main(int argc, char** argv) { } if (iter >= maxit) { cerr << "\nMaximum number of iterations reached (" << iter << "), terminating optimization...\n"; + } else if (!requested_stop) { + cerr << "\nScore tolerance reached (" << tolerance << "), terminating optimization...\n"; } } // should do training? if (true) { // do test evaluation From 0b39d5702927029977f69ae17d91ef19c371acfd Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Tue, 28 Jul 2015 15:59:39 +0300 Subject: [PATCH 7/8] Fix clab/lstm-parser#2: calculate and print las on test --- parser/lstm-parse.cc | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index e6d2f69..b96732e 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -428,7 +428,8 @@ void signal_callback_handler(int /* signum */) { requested_stop = true; } -unsigned compute_correct(const map& ref, const map& hyp, unsigned len) { +template +unsigned compute_correct(const map& ref, const map& hyp, unsigned len) { unsigned res = 0; for (unsigned i = 0; i < len; ++i) { auto ri = ref.find(i); @@ -440,6 +441,24 @@ unsigned compute_correct(const map& ref, const map& hyp, unsig return res; } +template +unsigned compute_correct(const map& ref1, const map& hyp1, + const map& ref2, const map& hyp2, unsigned len) { + unsigned res = 0; + for (unsigned i = 0; i < len; ++i) { + auto r1 = ref1.find(i); + auto h1 = hyp1.find(i); + auto r2 = ref2.find(i); + auto h2 = hyp2.find(i); + assert(r1 != ref1.end()); + assert(h1 != hyp1.end()); + assert(r2 != ref2.end()); + assert(h2 != hyp2.end()); + if (r1->second == h1->second && r2->second == h2->second) ++res; + } + return res; +} + void output_conll(const vector& sentence, const vector& pos, const vector& sentenceUnkStrings, const map& intToWords, @@ -714,7 +733,8 @@ int main(int argc, char** argv) { double llh = 0; double trs = 0; double right = 0; - double correct_heads = 0; + double correct_heads_unlabeled = 0; + double correct_heads_labeled = 0; double total_heads = 0; auto t_start = std::chrono::high_resolution_clock::now(); unsigned corpus_size = corpus.nsentencesDev; @@ -736,11 +756,12 @@ int main(int argc, char** argv) { map ref = parser.compute_heads(sentence.size(), actions, corpus.actions, &rel_ref); map hyp = parser.compute_heads(sentence.size(), pred, corpus.actions, &rel_hyp); output_conll(sentence, sentencePos, sentenceUnkStr, corpus.intToWords, corpus.intToPos, hyp, rel_hyp); - correct_heads += compute_correct(ref, hyp, sentence.size() - 1); + correct_heads_unlabeled += compute_correct(ref, hyp, sentence.size() - 1); + correct_heads_labeled += compute_correct(ref, hyp, rel_ref, rel_hyp, sentence.size() - 1); total_heads += sentence.size() - 1; } auto t_end = std::chrono::high_resolution_clock::now(); - cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " uas: " << (correct_heads / total_heads) << "\t[" << corpus_size << " sents in " << std::chrono::duration(t_end-t_start).count() << " ms]" << endl; + cerr << "TEST llh=" << llh << " ppl: " << exp(llh / trs) << " err: " << (trs - right) / trs << " uas: " << (correct_heads_unlabeled / total_heads) << " las: " << (correct_heads_labeled / total_heads) << "\t[" << corpus_size << " sents in " << std::chrono::duration(t_end-t_start).count() << " ms]" << endl; } for (unsigned i = 0; i < corpus.actions.size(); ++i) { //cerr << corpus.actions[i] << '\t' << parser.p_r->values[i].transpose() << endl; From 090e81127d9b48eb7ba46c16c1c1ef39199149dd Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Sun, 9 Aug 2015 11:29:29 +0300 Subject: [PATCH 8/8] Change default tolerance to -1, meaning no tolerance-based stopping --- parser/lstm-parse.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parser/lstm-parse.cc b/parser/lstm-parse.cc index b96732e..a3816ce 100644 --- a/parser/lstm-parse.cc +++ b/parser/lstm-parse.cc @@ -71,7 +71,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("lstm_input_dim", po::value()->default_value(60), "LSTM input dimension") ("train,t", "Should training be run?") ("maxit,M", po::value()->default_value(8000), "Maximum number of training iterations") - ("tolerance", po::value()->default_value(0.0), "Tolerance on dev uas for stopping training") + ("tolerance", po::value()->default_value(-1.0), "Tolerance on dev uas for stopping training") ("words,w", po::value(), "Pretrained word embeddings") ("help,h", "Help"); po::options_description dcmdline_options; @@ -546,7 +546,9 @@ int main(int argc, char** argv) { const unsigned maxit = conf["maxit"].as(); cerr << "Maximum number of iterations: " << maxit << "\n"; const double tolerance = conf["tolerance"].as(); - cerr << "Optimization tolerance: " << tolerance << "\n"; + if (tolerance > 0.0) { + cerr << "Optimization tolerance: " << tolerance << "\n"; + } ostringstream os; os << "parser_" << (USE_POS ? "pos" : "nopos") << '_' << LAYERS @@ -634,7 +636,7 @@ int main(int argc, char** argv) { time_t time_start = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); cerr << "TRAINING STARTED AT: " << put_time(localtime(&time_start), "%c %Z") << endl; while(!requested_stop && iter < maxit && - (uas < 0 || prev_uas < 0 || abs(prev_uas - uas) > tolerance)) { + (tolerance < 0 || uas < 0 || prev_uas < 0 || abs(prev_uas - uas) > tolerance)) { for (unsigned sii = 0; sii < status_every_i_iterations; ++sii) { if (si == corpus.nsentences) { si = 0;