From c3430fd5702a78ce16476a7a28661038ae47ead2 Mon Sep 17 00:00:00 2001 From: WilliamTambellini Date: Sat, 22 Dec 2018 18:09:36 -0800 Subject: [PATCH 01/13] Add CMake script Add CMake script to build a static lib cld3 and unittests. No dependency with Chrome. --- .gitignore | 1 + CMakeLists.txt | 69 ++++++++++++++++++++++++ misc/myprotobuf.cmake | 58 ++++++++++++++++++++ src/language_identifier_features_test.cc | 1 + 4 files changed, 129 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 misc/myprotobuf.cmake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..378eac2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..732a8ae --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,69 @@ +# This cmake script only builds a static cld3 lib and the unittests. + +project(cld3) + +# Old versions of cmake don't search/find protobuf lite +cmake_minimum_required(VERSION 3.9) + +find_package(Protobuf REQUIRED) +message(STATUS "Protobuf_FOUND= ${Protobuf_FOUND}") +message(STATUS "Protobuf_VERSION= ${Protobuf_VERSION}") +message(WARNING "Protobuf 2.5 and CLD3 seem happy together. This script does NOT check if your version of protobuf is compatible.") +message(STATUS "Protobuf_LIBRARIES= ${Protobuf_LIBRARIES}") +message(STATUS "Protobuf_LITE_LIBRARIES= ${Protobuf_LITE_LIBRARIES}") # Usually /usr/lib64/libprotobuf-lite.so + +# By default, protobuf_generate_cpp generates pb.* files directly in the cmake build dir. +# But CLD3 sources have been coded using hard coded paths to cld_3/protos/*.pb.h. +# So *.pb.h must be output to cld_3/protos. 
+# For that, let's use a custom my_protobuf_generate_cpp: +include(${CMAKE_CURRENT_SOURCE_DIR}/misc/myprotobuf.cmake) +my_protobuf_generate_cpp(cld_3/protos PROTO_SRCS PROTO_HDRS src/feature_extractor.proto src/sentence.proto src/task_spec.proto) +message(STATUS "PROTO_HDRS= ${PROTO_HDRS}") + +add_definitions(-fPIC) # Position Independent Code +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions(-std=c++11) # Needed for std::to_string(), ... + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) # needed to include generated pb headers + +add_library(${PROJECT_NAME} + ${PROTO_SRCS} ${PROTO_HDRS} + src/base.cc + src/embedding_feature_extractor.cc + src/embedding_network.cc + src/feature_extractor.cc + src/feature_extractor.h + src/feature_types.cc + src/fml_parser.cc + src/language_identifier_features.cc + src/lang_id_nn_params.cc + src/nnet_language_identifier.cc + src/registry.cc + src/relevant_script_feature.cc + src/sentence_features.cc + src/task_context.cc + src/task_context_params.cc + src/unicodetext.cc + src/utils.cc + src/workspace.cc + + src/script_span/generated_entities.cc + src/script_span/getonescriptspan.cc + src/script_span/getonescriptspan.h + src/script_span/getonescriptspan_test.cc + src/script_span/utf8statetable.cc + src/script_span/offsetmap.cc + src/script_span/text_processing.cc + src/script_span/text_processing.h + src/script_span/fixunicodevalue.cc + ) + +# unit tests exec: +add_executable(language_identifier_main src/language_identifier_main.cc) +target_link_libraries(language_identifier_main cld3 ${Protobuf_LITE_LIBRARIES}) + +add_executable(getonescriptspan_test src/script_span/getonescriptspan_test.cc) +target_link_libraries(getonescriptspan_test cld3 ${Protobuf_LITE_LIBRARIES}) + +add_executable(language_identifier_features_test src/language_identifier_features_test.cc) +target_link_libraries(language_identifier_features_test cld3 ${Protobuf_LITE_LIBRARIES}) diff --git a/misc/myprotobuf.cmake b/misc/myprotobuf.cmake new file 
mode 100644 index 0000000..c8d4242 --- /dev/null +++ b/misc/myprotobuf.cmake @@ -0,0 +1,58 @@ +# Special PROTOBUF_GENERATE_CPP which allows to set the output folder: +# From https://stackoverflow.com/users/1600278/akira-okumura + +function(MY_PROTOBUF_GENERATE_CPP PATH SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS) + foreach(DIR ${PROTOBUF_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + set(${HDRS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc") + list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h") + + execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${PATH}) + + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/${PATH} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + 
set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() \ No newline at end of file diff --git a/src/language_identifier_features_test.cc b/src/language_identifier_features_test.cc index 5835c86..05fb86c 100644 --- a/src/language_identifier_features_test.cc +++ b/src/language_identifier_features_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include #include +#include #include "base.h" #include "feature_extractor.h" From 86c4d70aaa89a478aff777d188003cc807d20dbe Mon Sep 17 00:00:00 2001 From: WilliamTambellini Date: Sat, 22 Dec 2018 18:09:36 -0800 Subject: [PATCH 02/13] Add CMake script Add CMake script to buid a static lib cld3 and unittests. No dependancy with Chrome. --- .gitignore | 1 + CMakeLists.txt | 68 ++++++++++++++++++++++++ misc/myprotobuf.cmake | 58 ++++++++++++++++++++ src/language_identifier_features_test.cc | 1 + 4 files changed, 128 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 misc/myprotobuf.cmake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..378eac2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..97ca1bd --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,68 @@ +# This cmake scripts only builds a static cld3 lib and the unittests. +project(cld3) + +# Old versions of cmake dont search/find protobuf lite +cmake_minimum_required(VERSION 3.9) + +find_package(Protobuf REQUIRED) +message(STATUS "Protobuf_FOUND= ${Protobuf_FOUND}") +message(STATUS "Protobuf_VERSION= ${Protobuf_VERSION}") +message(WARNING "Protobuf 2.5 and CLD3 seems happy together. 
This script does NOT check if your verison of protobuf is compatible.") +message(STATUS "Protobuf_LIBRARIES= ${Protobuf_LIBRARIES}") +message(STATUS "Protobuf_LITE_LIBRARIES= ${Protobuf_LITE_LIBRARIES}") # Usually /usr/lib64/libprotobuf-lite.so + +# By default, protobuf_generate_cpp generates pb.* files directy in the cmake build dir. +# But CLD3 sources have been coded using hard coded pathes to cld_3/protos/*.pb.h. +# So *.pb.h must be output to cld_3/protos. +# For that, let's use a custom my_protobuf_generate_cpp: +include(${CMAKE_CURRENT_SOURCE_DIR}/misc/myprotobuf.cmake) +my_protobuf_generate_cpp(cld_3/protos PROTO_SRCS PROTO_HDRS src/feature_extractor.proto src/sentence.proto src/task_spec.proto) +message(STATUS "PROTO_HDRS= ${PROTO_HDRS}") + +add_definitions(-fPIC) # Position Independant Code +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions(-std=c++11) # Needed for std::to_string(), ... + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) # needed to include generated pb headers + +add_library(${PROJECT_NAME} + ${PROTO_SRCS} ${PROTO_HDRS} + src/base.cc + src/embedding_feature_extractor.cc + src/embedding_network.cc + src/feature_extractor.cc + src/feature_extractor.h + src/feature_types.cc + src/fml_parser.cc + src/language_identifier_features.cc + src/lang_id_nn_params.cc + src/nnet_language_identifier.cc + src/registry.cc + src/relevant_script_feature.cc + src/sentence_features.cc + src/task_context.cc + src/task_context_params.cc + src/unicodetext.cc + src/utils.cc + src/workspace.cc + + src/script_span/generated_entities.cc + src/script_span/getonescriptspan.cc + src/script_span/getonescriptspan.h + src/script_span/getonescriptspan_test.cc + src/script_span/utf8statetable.cc + src/script_span/offsetmap.cc + src/script_span/text_processing.cc + src/script_span/text_processing.h + src/script_span/fixunicodevalue.cc + ) + +# unit tests exec: +add_executable(language_identifier_main src/language_identifier_main.cc) 
+target_link_libraries(language_identifier_main cld3 ${Protobuf_LITE_LIBRARIES}) + +add_executable(getonescriptspan_test src/script_span/getonescriptspan_test.cc) +target_link_libraries(getonescriptspan_test cld3 ${Protobuf_LITE_LIBRARIES}) + +add_executable(language_identifier_features_test src/language_identifier_features_test.cc) +target_link_libraries(language_identifier_features_test cld3 ${Protobuf_LITE_LIBRARIES}) diff --git a/misc/myprotobuf.cmake b/misc/myprotobuf.cmake new file mode 100644 index 0000000..c8d4242 --- /dev/null +++ b/misc/myprotobuf.cmake @@ -0,0 +1,58 @@ +# Special PROTOBUF_GENERATE_CPP which allows to set the output folder: +# From https://stackoverflow.com/users/1600278/akira-okumura + +function(MY_PROTOBUF_GENERATE_CPP PATH SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS) + foreach(DIR ${PROTOBUF_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + set(${HDRS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc") + list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h") + 
+ execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${PATH}) + + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/${PATH} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() \ No newline at end of file diff --git a/src/language_identifier_features_test.cc b/src/language_identifier_features_test.cc index 5835c86..05fb86c 100644 --- a/src/language_identifier_features_test.cc +++ b/src/language_identifier_features_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include #include +#include #include "base.h" #include "feature_extractor.h" From 414d3e3e0273ee680df7fdf97a1b05b938395c28 Mon Sep 17 00:00:00 2001 From: Takuto Ikuta Date: Thu, 14 Feb 2019 07:07:55 +0900 Subject: [PATCH 03/13] small fix for BUILD.gn --- src/BUILD.gn | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/BUILD.gn b/src/BUILD.gn index cf2a75c..80b912e 100644 --- a/src/BUILD.gn +++ b/src/BUILD.gn @@ -88,8 +88,6 @@ static_library("cld_3") { ] public_deps = [ "//third_party/protobuf:protobuf_lite", - ] - deps = [ ":protos", ] } From 9239af86a0157f44a5d5d49e765ee7c7d099abb8 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 21 Feb 2019 10:25:23 -0500 Subject: [PATCH 04/13] Fix -Wextra-semi warnings. 
Needed for https://crbug.com/926235 --- src/script_span/getonescriptspan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/script_span/getonescriptspan.h b/src/script_span/getonescriptspan.h index 1fa60ad..33a7130 100644 --- a/src/script_span/getonescriptspan.h +++ b/src/script_span/getonescriptspan.h @@ -93,7 +93,7 @@ class ScriptScanner { // again with the first byte of the following range. int MapBack(int text_offset); - const char* GetBufferStart() {return start_byte_;}; + const char* GetBufferStart() {return start_byte_;} private: // Skip over tags and non-letters From 6917502dea91e7af2348e53b8388529810da8d02 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Fri, 5 Apr 2019 10:09:59 -0700 Subject: [PATCH 05/13] Still need to implement tests. --- src/nnet_language_identifier.cc | 6 ++++++ src/nnet_language_identifier.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index abc3950..3d64cfc 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -47,6 +47,9 @@ struct LangChunksStats { // Number chunks corresponding to the language. int num_chunks = 0; + + // Specifies the ranges of text that language applies to. + std::vector> ranges; }; // Compares two pairs based on their values. @@ -304,6 +307,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, lang_stats[language].prob_sum += result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; + lang_stats[language].ranges.push_back(std::make_pair( + script_span.offset, script_span.offset + script_span.text_bytes)); } // Sort the languages based on the number of bytes associated with them. 
@@ -329,6 +334,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability = stats.prob_sum / stats.byte_sum; result.proportion = stats.byte_sum / byte_sum; result.is_reliable = ResultIsReliable(language, result.probability); + result.ranges = stats.ranges; results.push_back(result); } diff --git a/src/nnet_language_identifier.h b/src/nnet_language_identifier.h index 820aba6..95fccb7 100644 --- a/src/nnet_language_identifier.h +++ b/src/nnet_language_identifier.h @@ -53,6 +53,9 @@ class NNetLanguageIdentifier { // Proportion of bytes associated with the language. If FindLanguage is // called, this variable is set to 1. float proportion = 0.0; + + // Specifies the ranges of input text that this.language applies to. + std::vector> ranges; }; NNetLanguageIdentifier(); From ce81de5584763f03d192443056c6bb8aded867e0 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Fri, 5 Apr 2019 11:04:42 -0700 Subject: [PATCH 06/13] Add test for ranges. --- src/nnet_lang_id_test.cc | 28 ++++++++++++++++++++++++++++ src/nnet_language_identifier.cc | 4 ++-- src/nnet_language_identifier.h | 2 +- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/nnet_lang_id_test.cc b/src/nnet_lang_id_test.cc index 358fe1b..1682a82 100644 --- a/src/nnet_lang_id_test.cc +++ b/src/nnet_lang_id_test.cc @@ -209,6 +209,34 @@ bool TestMultipleLanguagesInInput() { << result.proportion << std::endl; return false; } + + // Skip over undefined language. + if (result.language.compare("und") == 0) + continue; + if (result.ranges.size() != 1) { + std::cout << " Should only detect one span containing " << result.language + << std::endl; + return false; + } + // Check that specified ranges for language are correct. 
+ int start_index = result.ranges[0].first; + int end_index = result.ranges[0].second; + std::string ranges_text = text.substr(start_index, end_index - start_index); + if (result.language.compare("bg") == 0) { + if (ranges_text.compare("Този текст е на Български.") != 0) { + std::cout << " Incorrect ranges returned for Bulgarian " << std::endl; + return false; + } + } else if (result.language.compare("en") == 0) { + if (ranges_text.compare("This piece of text is in English. ") != 0) { + std::cout << " Incorrect ranges returned for English " << std::endl; + return false; + } + } else { + std::cout << " Got language other than English or Bulgarian " + << std::endl; + return false; + } } std::cout << " Success!" << std::endl; return true; diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index 3d64cfc..f44e845 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -49,7 +49,7 @@ struct LangChunksStats { int num_chunks = 0; // Specifies the ranges of text that language applies to. - std::vector> ranges; + std::vector> ranges; }; // Compares two pairs based on their values. @@ -308,7 +308,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; lang_stats[language].ranges.push_back(std::make_pair( - script_span.offset, script_span.offset + script_span.text_bytes)); + script_span.offset, script_span.offset + script_span.text_bytes)); } // Sort the languages based on the number of bytes associated with them. diff --git a/src/nnet_language_identifier.h b/src/nnet_language_identifier.h index 95fccb7..46b7670 100644 --- a/src/nnet_language_identifier.h +++ b/src/nnet_language_identifier.h @@ -55,7 +55,7 @@ class NNetLanguageIdentifier { float proportion = 0.0; // Specifies the ranges of input text that this.language applies to. 
- std::vector> ranges; + std::vector> ranges; }; NNetLanguageIdentifier(); From 3767894623b59f222515cee18d4cbdb6155b0e94 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Fri, 12 Apr 2019 15:38:02 -0700 Subject: [PATCH 07/13] Ensure returned indices are relative to original input. --- src/nnet_language_identifier.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index f44e845..901440b 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -33,6 +33,8 @@ limitations under the License. #include "task_context.h" #include "workspace.h" +#include + namespace chrome_lang_id { namespace { @@ -301,14 +303,16 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, total_num_bytes += num_original_span_bytes; const string selected_text = SelectTextGivenScriptSpan(script_span); + result = FindLanguageOfValidUTF8(selected_text); language = result.language; lang_stats[language].byte_sum += num_original_span_bytes; lang_stats[language].prob_sum += result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; + // Set start and end indices relative to the original input. lang_stats[language].ranges.push_back(std::make_pair( - script_span.offset, script_span.offset + script_span.text_bytes)); + ss.MapBack(0), ss.MapBack(script_span.text_bytes))); } // Sort the languages based on the number of bytes associated with them. From d77234a1eaf08f7b0981b1cba9db323c33f13141 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Fri, 12 Apr 2019 15:39:08 -0700 Subject: [PATCH 08/13] Code cleanup. --- src/nnet_language_identifier.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index 901440b..b455079 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -33,8 +33,6 @@ limitations under the License. 
#include "task_context.h" #include "workspace.h" -#include - namespace chrome_lang_id { namespace { @@ -303,7 +301,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, total_num_bytes += num_original_span_bytes; const string selected_text = SelectTextGivenScriptSpan(script_span); - result = FindLanguageOfValidUTF8(selected_text); language = result.language; lang_stats[language].byte_sum += num_original_span_bytes; From 7dedf52728dabeeb2120238708ae5e6f5fc47e69 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Wed, 17 Apr 2019 11:08:37 -0700 Subject: [PATCH 09/13] Associate probability with each SpanInfo. --- src/nnet_lang_id_test.cc | 4 ++-- src/nnet_language_identifier.cc | 10 +++++----- src/nnet_language_identifier.h | 17 +++++++++++++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/nnet_lang_id_test.cc b/src/nnet_lang_id_test.cc index 1682a82..ee16092 100644 --- a/src/nnet_lang_id_test.cc +++ b/src/nnet_lang_id_test.cc @@ -219,8 +219,8 @@ bool TestMultipleLanguagesInInput() { return false; } // Check that specified ranges for language are correct. - int start_index = result.ranges[0].first; - int end_index = result.ranges[0].second; + int start_index = result.ranges[0].start_index; + int end_index = result.ranges[0].end_index; std::string ranges_text = text.substr(start_index, end_index - start_index); if (result.language.compare("bg") == 0) { if (ranges_text.compare("Този текст е на Български.") != 0) { diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index b455079..fc115db 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -48,8 +48,8 @@ struct LangChunksStats { // Number chunks corresponding to the language. int num_chunks = 0; - // Specifies the ranges of text that language applies to. - std::vector> ranges; + // Specifies the spans of text that language applies to. + std::vector ranges; }; // Compares two pairs based on their values. 
@@ -307,9 +307,9 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, lang_stats[language].prob_sum += result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; - // Set start and end indices relative to the original input. - lang_stats[language].ranges.push_back(std::make_pair( - ss.MapBack(0), ss.MapBack(script_span.text_bytes))); + // Add SpanInfo. Start and end indices are relative to original input. + lang_stats[language].ranges.push_back(SpanInfo( + ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability)); } // Sort the languages based on the number of bytes associated with them. diff --git a/src/nnet_language_identifier.h b/src/nnet_language_identifier.h index 46b7670..1d44444 100644 --- a/src/nnet_language_identifier.h +++ b/src/nnet_language_identifier.h @@ -44,6 +44,19 @@ class LanguageIdEmbeddingFeatureExtractor // Class for detecting the language of a document. class NNetLanguageIdentifier { public: + // Holds probability that Span, specified by start/end indices, is a given + // language. The langauge is not stored here; it can be found in Result, which + // holds a vector of SpanInfo. + struct SpanInfo { + SpanInfo(int start_index_val, int end_index_val, float probability_val) + : start_index(start_index_val), + end_index(end_index_val), + probability(probability_val) {} + int start_index = -1; + int end_index = -1; + float probability = 0.0; + }; + // Information about a predicted language. struct Result { string language = kUnknown; @@ -54,8 +67,8 @@ class NNetLanguageIdentifier { // called, this variable is set to 1. float proportion = 0.0; - // Specifies the ranges of input text that this.language applies to. - std::vector> ranges; + // Specifies the spans of input text that |language| applies to. 
+ std::vector ranges; }; NNetLanguageIdentifier(); From a924f910579731a13a83bfd1aa43e28e351265da Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Tue, 23 Apr 2019 13:34:48 -0700 Subject: [PATCH 10/13] Respond to feedback. --- src/nnet_lang_id_test.cc | 24 ++++++++++++------------ src/nnet_language_identifier.cc | 9 +++++---- src/nnet_language_identifier.h | 4 ++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/nnet_lang_id_test.cc b/src/nnet_lang_id_test.cc index ee16092..a7a2520 100644 --- a/src/nnet_lang_id_test.cc +++ b/src/nnet_lang_id_test.cc @@ -211,25 +211,25 @@ bool TestMultipleLanguagesInInput() { } // Skip over undefined language. - if (result.language.compare("und") == 0) + if (result.language == "und") continue; - if (result.ranges.size() != 1) { + if (result.byte_ranges.size() != 1) { std::cout << " Should only detect one span containing " << result.language << std::endl; return false; } - // Check that specified ranges for language are correct. - int start_index = result.ranges[0].start_index; - int end_index = result.ranges[0].end_index; - std::string ranges_text = text.substr(start_index, end_index - start_index); - if (result.language.compare("bg") == 0) { - if (ranges_text.compare("Този текст е на Български.") != 0) { - std::cout << " Incorrect ranges returned for Bulgarian " << std::endl; + // Check that specified byte ranges for language are correct. + int start_index = result.byte_ranges[0].start_index; + int end_index = result.byte_ranges[0].end_index; + std::string byte_ranges_text = text.substr(start_index, end_index - start_index); + if (result.language == "bg") { + if (byte_ranges_text.compare("Този текст е на Български.") != 0) { + std::cout << " Incorrect byte ranges returned for Bulgarian " << std::endl; return false; } - } else if (result.language.compare("en") == 0) { - if (ranges_text.compare("This piece of text is in English. 
") != 0) { - std::cout << " Incorrect ranges returned for English " << std::endl; + } else if (result.language == "en") { + if (byte_ranges_text.compare("This piece of text is in English. ") != 0) { + std::cout << " Incorrect byte ranges returned for English " << std::endl; return false; } } else { diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index fc115db..c1fa755 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -48,8 +48,8 @@ struct LangChunksStats { // Number chunks corresponding to the language. int num_chunks = 0; - // Specifies the spans of text that language applies to. - std::vector ranges; + // Specifies the byte ranges that language applies to. + std::vector byte_ranges; }; // Compares two pairs based on their values. @@ -301,6 +301,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, total_num_bytes += num_original_span_bytes; const string selected_text = SelectTextGivenScriptSpan(script_span); + result = FindLanguageOfValidUTF8(selected_text); language = result.language; lang_stats[language].byte_sum += num_original_span_bytes; @@ -308,7 +309,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; // Add SpanInfo. Start and end indices are relative to original input. 
- lang_stats[language].ranges.push_back(SpanInfo( + lang_stats[language].byte_ranges.push_back(SpanInfo( ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability)); } @@ -335,7 +336,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability = stats.prob_sum / stats.byte_sum; result.proportion = stats.byte_sum / byte_sum; result.is_reliable = ResultIsReliable(language, result.probability); - result.ranges = stats.ranges; + result.byte_ranges = stats.byte_ranges; results.push_back(result); } diff --git a/src/nnet_language_identifier.h b/src/nnet_language_identifier.h index 1d44444..e5eb862 100644 --- a/src/nnet_language_identifier.h +++ b/src/nnet_language_identifier.h @@ -67,8 +67,8 @@ class NNetLanguageIdentifier { // called, this variable is set to 1. float proportion = 0.0; - // Specifies the spans of input text that |language| applies to. - std::vector ranges; + // Specifies the byte ranges that |language| applies to. + std::vector byte_ranges; }; NNetLanguageIdentifier(); From 22335cdbee28b16dfc6fecb306207a8d8216aa5c Mon Sep 17 00:00:00 2001 From: Alfredo Luque Date: Fri, 30 Aug 2019 13:34:37 -0700 Subject: [PATCH 11/13] updates --- setup.py | 98 +++++++++++++++++++++++++++++++++ src/cld3.pyx | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 setup.py create mode 100644 src/cld3.pyx diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..bc99068 --- /dev/null +++ b/setup.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +import subprocess +from os import path, makedirs + +from setuptools import setup, Extension +from distutils.command.build import build + +from Cython.Build import cythonize + + +PROTOS = ["src/sentence.proto", "src/feature_extractor.proto", + "src/task_spec.proto"] + +SOURCES = ["src/cld3.pyx", + "src/base.cc", + "src/cld_3/protos/src/feature_extractor.pb.cc", + "src/cld_3/protos/src/sentence.pb.cc", + 
"src/cld_3/protos/src/task_spec.pb.cc", + "src/embedding_feature_extractor.cc", + "src/embedding_network.cc", + "src/feature_extractor.cc", + "src/feature_types.cc", + "src/fml_parser.cc", + "src/lang_id_nn_params.cc", + "src/language_identifier_features.cc", + "src/nnet_language_identifier.cc", + "src/registry.cc", + "src/relevant_script_feature.cc", + "src/script_span/fixunicodevalue.cc", + "src/script_span/generated_entities.cc", + "src/script_span/generated_ulscript.cc", + "src/script_span/getonescriptspan.cc", + "src/script_span/offsetmap.cc", + "src/script_span/text_processing.cc", + "src/script_span/utf8statetable.cc", + "src/sentence_features.cc", + "src/task_context.cc", + "src/task_context_params.cc", + "src/unicodetext.cc", + "src/utils.cc", + "src/workspace.cc"] + +INCLUDES = ["/usr/local/include","./src", "./src/cld_3/protos/"] + +LIBRARIES = ["protobuf"] + +LONG_DESCRIPTION = \ +"""Python bindings for the CLD3 language classification library by Google.""" + +CLASSIFIERS = [ + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 3", + "Programming Language :: C++", + "Operating System :: OS Independent", + "Development Status :: 4 - Beta", + "Topic :: Text Processing :: Linguistic", + "Intended Audience :: Developers",] + + +class BuildProtobuf(build): + def run(self): + if not path.exists("src/cld_3/protos"): + # Create protobufs dir + makedirs("src/cld_3/protos") + + # Build protobuf stuff + command = ["protoc"] + command.extend(PROTOS) + command.append("--cpp_out={}".format( + path.join("src", "cld_3", "protos"))) + subprocess.run(command, check=True) + + build.run(self) + + +ext = Extension( + "cld3", + sources=SOURCES, + include_dirs=INCLUDES, + libraries=LIBRARIES, + language="c++", + extra_compile_args=["-std=c++11"]) + +setup( + name="cld3", + version="0.2.3", + cmdclass={"build": BuildProtobuf}, + 
author="Google, Johannes Baiter, Elizabeth Myers", + author_email="elizabeth@interlinked.me", + description="CLD3 Python bindings", + long_description=LONG_DESCRIPTION, + license="Apache2", + keywords=["cld3", "cffi"], + url="https://github.com/iamthebot/cld3", + ext_modules=cythonize([ext])) diff --git a/src/cld3.pyx b/src/cld3.pyx new file mode 100644 index 0000000..ed4eb88 --- /dev/null +++ b/src/cld3.pyx @@ -0,0 +1,153 @@ +from libcpp.vector cimport vector +from libcpp.string cimport string + +from collections import namedtuple + + +cdef extern from "nnet_language_identifier.h" namespace "chrome_lang_id::NNetLanguageIdentifier": + cdef struct Result: + string language + float probability + bint is_reliable + float proportion + + +cdef extern from "nnet_language_identifier.h" namespace "chrome_lang_id": + cdef cppclass NNetLanguageIdentifier: + NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes); + Result FindLanguage(string &text) + vector[Result] FindTopNMostFreqLangs(string &text, int num_langs) + const char kUnknown[] + + +LanguagePrediction = namedtuple("LanguagePrediction", + ("language", "probability", "is_reliable", + "proportion")) + +cdef class LanguageIdentifier: + """ + Basic Python API for using CLD3 + """ + cdef NNetLanguageIdentifier* model + cdef unsigned int min_bytes + cdef unsigned int max_bytes + + def __init__(self, min_bytes=0, max_bytes=1024): + """ + Initialize a LanguageIdentifier + + :param min_bytes: The minimum number of bytes to look at for the prediction. + :param max_bytes: The maximum number of bytes to consider + """ + self.min_bytes = min_bytes + self.max_bytes = max_bytes + self.model = new NNetLanguageIdentifier(self.min_bytes, self.max_bytes) + + def get_language(self, unicode text): + """Get the most likely language for the given text. + + The prediction is based on the first N bytes where N is the minumum between + the number of interchange valid UTF8 bytes and max_bytes. 
If N is less + than min_bytes long, then this function returns None. + + If the language cannot be determined, None will be returned. + """ + cdef Result res = self.model.FindLanguage(text.encode('utf8')) + + if str(res.language) != self.model.kUnknown: + language = res.language.decode('utf8') + return LanguagePrediction(language, res.probability, res.is_reliable, + res.proportion) + else: + return None + + def get_frequent_languages( + self, + unicode text, + int num_langs, + ): + """Find the most frequent languages in the given text. + + Splits the input text (up to the first byte, if any, that is not + interchange valid UTF8) into spans based on the script, predicts a language + for each span, and returns a list storing the top num_langs most frequent + languages along with additional information (e.g., proportions). The number + of bytes considered for each span is the minimum between the size of the + span and max_bytes. If more languages are requested than what is available + in the input, then the list returned will only have the number of results + found. Also, if the size of the span is less than min_bytes long, then the + span is skipped. If the input text is too long, only the first 1000 bytes + are processed. + """ + cdef vector[Result] results = self.model.FindTopNMostFreqLangs( + text.encode('utf8'), + num_langs + ) + out = [] + for res in results: + if str(res.language) != self.model.kUnknown: + language = res.language.decode('utf8') + out.append(LanguagePrediction( + language, res.probability, res.is_reliable, res.proportion)) + return out + + +def get_language(unicode text, unsigned int min_bytes=0, unsigned int max_bytes=1000): + """Get the most likely language for the given text. + + The prediction is based on the first N bytes where N is the minumum between + the number of interchange valid UTF8 bytes and max_bytes. If N is less + than min_bytes long, then this function returns None. 
+ + If the language cannot be determined, None will be returned. + + This function requires initialization of a new identifier on each call so it's best + to use the LanguageIdentifier class instead for multiple calls + """ + cdef NNetLanguageIdentifier* ident = new NNetLanguageIdentifier(min_bytes, max_bytes) + cdef Result res = ident.FindLanguage(text.encode('utf8')) + del ident + if str(res.language) != ident.kUnknown: + language = res.language.decode('utf8') + return LanguagePrediction(language, res.probability, res.is_reliable, + res.proportion) + else: + return None + + + +def get_frequent_languages( + unicode text, + unsigned int num_langs, + unsigned int min_bytes=0, + int max_bytes=1000 +): + """Find the most frequent languages in the given text. + + Splits the input text (up to the first byte, if any, that is not + interchange valid UTF8) into spans based on the script, predicts a language + for each span, and returns a list storing the top num_langs most frequent + languages along with additional information (e.g., proportions). The number + of bytes considered for each span is the minimum between the size of the + span and max_bytes. If more languages are requested than what is available + in the input, then the list returned will only have the number of results + found. Also, if the size of the span is less than min_bytes long, then the + span is skipped. If the input text is too long, only the first 1000 bytes + are processed. 
+ + This function requires initialization of a new identifier on each call so it's best + to use the LanguageIdentifier class instead for multiple calls + """ + cdef NNetLanguageIdentifier* ident = new NNetLanguageIdentifier(min_bytes, max_bytes) + cdef vector[Result] results = ident.FindTopNMostFreqLangs( + text.encode('utf8'), + num_langs + ) + del ident + out = [] + for res in results: + if str(res.language) != ident.kUnknown: + language = res.language.decode('utf8') + out.append(LanguagePrediction( + language, res.probability, res.is_reliable, res.proportion)) + return out From e58f92e1e502133ca5c90604ad63b450dd8e7059 Mon Sep 17 00:00:00 2001 From: Alfredo Luque Date: Fri, 30 Aug 2019 13:40:56 -0700 Subject: [PATCH 12/13] update readme --- README.md | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 8600833..77f2659 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,15 @@ -# Compact Language Detector v3 (CLD3) +# Compact Language Detector v3 (CLD3) Python Edition * [Model](#model) * [Installation](#installation) * [Contact](#contact) * [Credits](#credits) +### Notes +This is an effort to fix issues with the initial Python fork at https://github.com/Elizafox/cld3 +including memory leaks and no reuse of a Language model for multiple calls. This also pulls in much newer cld3 upstream code. + + ### Model CLD3 is a neural network model for language identification. This package @@ -27,29 +32,21 @@ To get a language prediction for the input text, we simply perform a forward ![Figure](model.png "CLD3") ### Installation -CLD3 is designed to run in the Chrome browser, so it relies on code in -[Chromium](http://www.chromium.org/). -The steps for building and running the demo of the language detection model are: - -- [check out](http://www.chromium.org/developers/how-tos/get-the-code) the - Chromium repository. 
-- copy the code to `//third_party/cld_3` -- Uncomment `language_identifier_main` executable in `src/BUILD.gn`. -- build and run the model using the commands: - -```shell -gn gen out/Default -ninja -C out/Default third_party/cld_3/src/src:language_identifier_main -out/Default/language_identifier_main +Building the Python wheel requires the protobuf compiler and its headers to be installed. +If you run into issues with protobufs not compiling, just go into the `src` directory and run + +``` +mkdir -p cld_3/protos +protoc --cpp_out=cld_3/protos *.proto ``` -### Bugs and Feature Requests -Open a [GitHub issue](https://github.com/google/cld3/issues) for this repository to file bugs and feature requests. +To generate a python wheel (from the root of this repo): -### Announcements and Discussion +``` +python setup.py bdist_wheel +``` -For announcements regarding major updates as well as general discussion list, please subscribe to: -[cld3-users@googlegroups.com](https://groups.google.com/forum/#!forum/cld3-users) +Builds have been tested with GCC9.0 on Ubuntu 18.04 and Apple Clang 11.0.0 on OSX 10.15 (Catalina Beta) ### Credits @@ -71,3 +68,5 @@ Original authors of the code in this package include (in alphabetical order): * Slav Petrov * Stefan Istrate * Terry Koo + +and Elizabeth Myers for the original Python bindings From 93b53a87e138ff2d78ffaf52a01ac01ec04570d9 Mon Sep 17 00:00:00 2001 From: Alfredo Luque Date: Fri, 30 Aug 2019 15:46:57 -0700 Subject: [PATCH 13/13] fix protobuf --- setup.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index bc99068..332f46f 100644 --- a/setup.py +++ b/setup.py @@ -9,14 +9,14 @@ from Cython.Build import cythonize -PROTOS = ["src/sentence.proto", "src/feature_extractor.proto", - "src/task_spec.proto"] +PROTOS = ["sentence.proto", "feature_extractor.proto", + "task_spec.proto"] SOURCES = ["src/cld3.pyx", "src/base.cc", - "src/cld_3/protos/src/feature_extractor.pb.cc", - 
"src/cld_3/protos/src/sentence.pb.cc", - "src/cld_3/protos/src/task_spec.pb.cc", + "src/cld_3/protos/feature_extractor.pb.cc", + "src/cld_3/protos/sentence.pb.cc", + "src/cld_3/protos/task_spec.pb.cc", "src/embedding_feature_extractor.cc", "src/embedding_network.cc", "src/feature_extractor.cc", @@ -41,7 +41,7 @@ "src/utils.cc", "src/workspace.cc"] -INCLUDES = ["/usr/local/include","./src", "./src/cld_3/protos/"] +INCLUDES = ["/usr/local/include","/src", "./src/cld_3/protos/"] LIBRARIES = ["protobuf"] @@ -70,8 +70,8 @@ def run(self): command = ["protoc"] command.extend(PROTOS) command.append("--cpp_out={}".format( - path.join("src", "cld_3", "protos"))) - subprocess.run(command, check=True) + path.join("cld_3", "protos"))) + subprocess.run(command, check=True, cwd='src') build.run(self) @@ -84,6 +84,7 @@ def run(self): language="c++", extra_compile_args=["-std=c++11"]) + setup( name="cld3", version="0.2.3",