From c3430fd5702a78ce16476a7a28661038ae47ead2 Mon Sep 17 00:00:00 2001 From: WilliamTambellini Date: Sat, 22 Dec 2018 18:09:36 -0800 Subject: [PATCH 01/13] Add CMake script Add CMake script to build a static lib cld3 and unittests. No dependency with Chrome. --- .gitignore | 1 + CMakeLists.txt | 69 ++++++++++++++++++++++++ misc/myprotobuf.cmake | 58 ++++++++++++++++++++ src/language_identifier_features_test.cc | 1 + 4 files changed, 129 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 misc/myprotobuf.cmake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..378eac2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..732a8ae --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,69 @@ +# This cmake script only builds a static cld3 lib and the unittests. + +project(cld3) + +# Old versions of cmake don't search/find protobuf lite +cmake_minimum_required(VERSION 3.9) + +find_package(Protobuf REQUIRED) +message(STATUS "Protobuf_FOUND= ${Protobuf_FOUND}") +message(STATUS "Protobuf_VERSION= ${Protobuf_VERSION}") +message(WARNING "Protobuf 2.5 and CLD3 seem happy together. This script does NOT check if your version of protobuf is compatible.") +message(STATUS "Protobuf_LIBRARIES= ${Protobuf_LIBRARIES}") +message(STATUS "Protobuf_LITE_LIBRARIES= ${Protobuf_LITE_LIBRARIES}") # Usually /usr/lib64/libprotobuf-lite.so + +# By default, protobuf_generate_cpp generates pb.* files directly in the cmake build dir. +# But CLD3 sources have been coded using hard coded paths to cld_3/protos/*.pb.h. +# So *.pb.h must be output to cld_3/protos. 
+# For that, let's use a custom my_protobuf_generate_cpp: +include(${CMAKE_CURRENT_SOURCE_DIR}/misc/myprotobuf.cmake) +my_protobuf_generate_cpp(cld_3/protos PROTO_SRCS PROTO_HDRS src/feature_extractor.proto src/sentence.proto src/task_spec.proto) +message(STATUS "PROTO_HDRS= ${PROTO_HDRS}") + +add_definitions(-fPIC) # Position Independent Code +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions(-std=c++11) # Needed for std::to_string(), ... + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) # needed to include generated pb headers + +add_library(${PROJECT_NAME} + ${PROTO_SRCS} ${PROTO_HDRS} + src/base.cc + src/embedding_feature_extractor.cc + src/embedding_network.cc + src/feature_extractor.cc + src/feature_extractor.h + src/feature_types.cc + src/fml_parser.cc + src/language_identifier_features.cc + src/lang_id_nn_params.cc + src/nnet_language_identifier.cc + src/registry.cc + src/relevant_script_feature.cc + src/sentence_features.cc + src/task_context.cc + src/task_context_params.cc + src/unicodetext.cc + src/utils.cc + src/workspace.cc + + src/script_span/generated_entities.cc + src/script_span/getonescriptspan.cc + src/script_span/getonescriptspan.h + src/script_span/getonescriptspan_test.cc + src/script_span/utf8statetable.cc + src/script_span/offsetmap.cc + src/script_span/text_processing.cc + src/script_span/text_processing.h + src/script_span/fixunicodevalue.cc + ) + +# unit tests exec: +add_executable(language_identifier_main src/language_identifier_main.cc) +target_link_libraries(language_identifier_main cld3 ${Protobuf_LITE_LIBRARIES}) + +add_executable(getonescriptspan_test src/script_span/getonescriptspan_test.cc) +target_link_libraries(getonescriptspan_test cld3 ${Protobuf_LITE_LIBRARIES}) + +add_executable(language_identifier_features_test src/language_identifier_features_test.cc) +target_link_libraries(language_identifier_features_test cld3 ${Protobuf_LITE_LIBRARIES}) diff --git a/misc/myprotobuf.cmake b/misc/myprotobuf.cmake new file 
mode 100644 index 0000000..c8d4242 --- /dev/null +++ b/misc/myprotobuf.cmake @@ -0,0 +1,58 @@ +# Special PROTOBUF_GENERATE_CPP which allows to set the output folder: +# From https://stackoverflow.com/users/1600278/akira-okumura + +function(MY_PROTOBUF_GENERATE_CPP PATH SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS) + foreach(DIR ${PROTOBUF_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + set(${HDRS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc") + list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h") + + execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${PATH}) + + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/${PATH} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + 
set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() \ No newline at end of file diff --git a/src/language_identifier_features_test.cc b/src/language_identifier_features_test.cc index 5835c86..05fb86c 100644 --- a/src/language_identifier_features_test.cc +++ b/src/language_identifier_features_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include #include +#include #include "base.h" #include "feature_extractor.h" From 86c4d70aaa89a478aff777d188003cc807d20dbe Mon Sep 17 00:00:00 2001 From: WilliamTambellini Date: Sat, 22 Dec 2018 18:09:36 -0800 Subject: [PATCH 02/13] Add CMake script Add CMake script to buid a static lib cld3 and unittests. No dependancy with Chrome. --- .gitignore | 1 + CMakeLists.txt | 68 ++++++++++++++++++++++++ misc/myprotobuf.cmake | 58 ++++++++++++++++++++ src/language_identifier_features_test.cc | 1 + 4 files changed, 128 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 misc/myprotobuf.cmake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..378eac2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..97ca1bd --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,68 @@ +# This cmake scripts only builds a static cld3 lib and the unittests. +project(cld3) + +# Old versions of cmake dont search/find protobuf lite +cmake_minimum_required(VERSION 3.9) + +find_package(Protobuf REQUIRED) +message(STATUS "Protobuf_FOUND= ${Protobuf_FOUND}") +message(STATUS "Protobuf_VERSION= ${Protobuf_VERSION}") +message(WARNING "Protobuf 2.5 and CLD3 seems happy together. 
This script does NOT check if your verison of protobuf is compatible.") +message(STATUS "Protobuf_LIBRARIES= ${Protobuf_LIBRARIES}") +message(STATUS "Protobuf_LITE_LIBRARIES= ${Protobuf_LITE_LIBRARIES}") # Usually /usr/lib64/libprotobuf-lite.so + +# By default, protobuf_generate_cpp generates pb.* files directy in the cmake build dir. +# But CLD3 sources have been coded using hard coded pathes to cld_3/protos/*.pb.h. +# So *.pb.h must be output to cld_3/protos. +# For that, let's use a custom my_protobuf_generate_cpp: +include(${CMAKE_CURRENT_SOURCE_DIR}/misc/myprotobuf.cmake) +my_protobuf_generate_cpp(cld_3/protos PROTO_SRCS PROTO_HDRS src/feature_extractor.proto src/sentence.proto src/task_spec.proto) +message(STATUS "PROTO_HDRS= ${PROTO_HDRS}") + +add_definitions(-fPIC) # Position Independant Code +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions(-std=c++11) # Needed for std::to_string(), ... + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) # needed to include generated pb headers + +add_library(${PROJECT_NAME} + ${PROTO_SRCS} ${PROTO_HDRS} + src/base.cc + src/embedding_feature_extractor.cc + src/embedding_network.cc + src/feature_extractor.cc + src/feature_extractor.h + src/feature_types.cc + src/fml_parser.cc + src/language_identifier_features.cc + src/lang_id_nn_params.cc + src/nnet_language_identifier.cc + src/registry.cc + src/relevant_script_feature.cc + src/sentence_features.cc + src/task_context.cc + src/task_context_params.cc + src/unicodetext.cc + src/utils.cc + src/workspace.cc + + src/script_span/generated_entities.cc + src/script_span/getonescriptspan.cc + src/script_span/getonescriptspan.h + src/script_span/getonescriptspan_test.cc + src/script_span/utf8statetable.cc + src/script_span/offsetmap.cc + src/script_span/text_processing.cc + src/script_span/text_processing.h + src/script_span/fixunicodevalue.cc + ) + +# unit tests exec: +add_executable(language_identifier_main src/language_identifier_main.cc) 
+target_link_libraries(language_identifier_main cld3 ${Protobuf_LITE_LIBRARIES}) + +add_executable(getonescriptspan_test src/script_span/getonescriptspan_test.cc) +target_link_libraries(getonescriptspan_test cld3 ${Protobuf_LITE_LIBRARIES}) + +add_executable(language_identifier_features_test src/language_identifier_features_test.cc) +target_link_libraries(language_identifier_features_test cld3 ${Protobuf_LITE_LIBRARIES}) diff --git a/misc/myprotobuf.cmake b/misc/myprotobuf.cmake new file mode 100644 index 0000000..c8d4242 --- /dev/null +++ b/misc/myprotobuf.cmake @@ -0,0 +1,58 @@ +# Special PROTOBUF_GENERATE_CPP which allows to set the output folder: +# From https://stackoverflow.com/users/1600278/akira-okumura + +function(MY_PROTOBUF_GENERATE_CPP PATH SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS) + foreach(DIR ${PROTOBUF_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + set(${HDRS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc") + list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h") + 
+ execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${PATH}) + + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.cc" + "${CMAKE_CURRENT_BINARY_DIR}/${PATH}/${FIL_WE}.pb.h" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR}/${PATH} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() \ No newline at end of file diff --git a/src/language_identifier_features_test.cc b/src/language_identifier_features_test.cc index 5835c86..05fb86c 100644 --- a/src/language_identifier_features_test.cc +++ b/src/language_identifier_features_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include #include +#include #include "base.h" #include "feature_extractor.h" From 414d3e3e0273ee680df7fdf97a1b05b938395c28 Mon Sep 17 00:00:00 2001 From: Takuto Ikuta Date: Thu, 14 Feb 2019 07:07:55 +0900 Subject: [PATCH 03/13] small fix for BUILD.gn --- src/BUILD.gn | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/BUILD.gn b/src/BUILD.gn index cf2a75c..80b912e 100644 --- a/src/BUILD.gn +++ b/src/BUILD.gn @@ -88,8 +88,6 @@ static_library("cld_3") { ] public_deps = [ "//third_party/protobuf:protobuf_lite", - ] - deps = [ ":protos", ] } From 9239af86a0157f44a5d5d49e765ee7c7d099abb8 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 21 Feb 2019 10:25:23 -0500 Subject: [PATCH 04/13] Fix -Wextra-semi warnings. 
Needed for https://crbug.com/926235 --- src/script_span/getonescriptspan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/script_span/getonescriptspan.h b/src/script_span/getonescriptspan.h index 1fa60ad..33a7130 100644 --- a/src/script_span/getonescriptspan.h +++ b/src/script_span/getonescriptspan.h @@ -93,7 +93,7 @@ class ScriptScanner { // again with the first byte of the following range. int MapBack(int text_offset); - const char* GetBufferStart() {return start_byte_;}; + const char* GetBufferStart() {return start_byte_;} private: // Skip over tags and non-letters From 6917502dea91e7af2348e53b8388529810da8d02 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Fri, 5 Apr 2019 10:09:59 -0700 Subject: [PATCH 05/13] Still need to implement tests. --- src/nnet_language_identifier.cc | 6 ++++++ src/nnet_language_identifier.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index abc3950..3d64cfc 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -47,6 +47,9 @@ struct LangChunksStats { // Number chunks corresponding to the language. int num_chunks = 0; + + // Specifies the ranges of text that language applies to. + std::vector> ranges; }; // Compares two pairs based on their values. @@ -304,6 +307,8 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, lang_stats[language].prob_sum += result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; + lang_stats[language].ranges.push_back(std::make_pair( + script_span.offset, script_span.offset + script_span.text_bytes)); } // Sort the languages based on the number of bytes associated with them. 
@@ -329,6 +334,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability = stats.prob_sum / stats.byte_sum; result.proportion = stats.byte_sum / byte_sum; result.is_reliable = ResultIsReliable(language, result.probability); + result.ranges = stats.ranges; results.push_back(result); } diff --git a/src/nnet_language_identifier.h b/src/nnet_language_identifier.h index 820aba6..95fccb7 100644 --- a/src/nnet_language_identifier.h +++ b/src/nnet_language_identifier.h @@ -53,6 +53,9 @@ class NNetLanguageIdentifier { // Proportion of bytes associated with the language. If FindLanguage is // called, this variable is set to 1. float proportion = 0.0; + + // Specifies the ranges of input text that this.language applies to. + std::vector> ranges; }; NNetLanguageIdentifier(); From ce81de5584763f03d192443056c6bb8aded867e0 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Fri, 5 Apr 2019 11:04:42 -0700 Subject: [PATCH 06/13] Add test for ranges. --- src/nnet_lang_id_test.cc | 28 ++++++++++++++++++++++++++++ src/nnet_language_identifier.cc | 4 ++-- src/nnet_language_identifier.h | 2 +- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/nnet_lang_id_test.cc b/src/nnet_lang_id_test.cc index 358fe1b..1682a82 100644 --- a/src/nnet_lang_id_test.cc +++ b/src/nnet_lang_id_test.cc @@ -209,6 +209,34 @@ bool TestMultipleLanguagesInInput() { << result.proportion << std::endl; return false; } + + // Skip over undefined language. + if (result.language.compare("und") == 0) + continue; + if (result.ranges.size() != 1) { + std::cout << " Should only detect one span containing " << result.language + << std::endl; + return false; + } + // Check that specified ranges for language are correct. 
+ int start_index = result.ranges[0].first; + int end_index = result.ranges[0].second; + std::string ranges_text = text.substr(start_index, end_index - start_index); + if (result.language.compare("bg") == 0) { + if (ranges_text.compare("Този текст е на Български.") != 0) { + std::cout << " Incorrect ranges returned for Bulgarian " << std::endl; + return false; + } + } else if (result.language.compare("en") == 0) { + if (ranges_text.compare("This piece of text is in English. ") != 0) { + std::cout << " Incorrect ranges returned for English " << std::endl; + return false; + } + } else { + std::cout << " Got language other than English or Bulgarian " + << std::endl; + return false; + } } std::cout << " Success!" << std::endl; return true; diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index 3d64cfc..f44e845 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -49,7 +49,7 @@ struct LangChunksStats { int num_chunks = 0; // Specifies the ranges of text that language applies to. - std::vector> ranges; + std::vector> ranges; }; // Compares two pairs based on their values. @@ -308,7 +308,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; lang_stats[language].ranges.push_back(std::make_pair( - script_span.offset, script_span.offset + script_span.text_bytes)); + script_span.offset, script_span.offset + script_span.text_bytes)); } // Sort the languages based on the number of bytes associated with them. diff --git a/src/nnet_language_identifier.h b/src/nnet_language_identifier.h index 95fccb7..46b7670 100644 --- a/src/nnet_language_identifier.h +++ b/src/nnet_language_identifier.h @@ -55,7 +55,7 @@ class NNetLanguageIdentifier { float proportion = 0.0; // Specifies the ranges of input text that this.language applies to. 
- std::vector> ranges; + std::vector> ranges; }; NNetLanguageIdentifier(); From 3767894623b59f222515cee18d4cbdb6155b0e94 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Fri, 12 Apr 2019 15:38:02 -0700 Subject: [PATCH 07/13] Ensure returned indices are relative to original input. --- src/nnet_language_identifier.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index f44e845..901440b 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -33,6 +33,8 @@ limitations under the License. #include "task_context.h" #include "workspace.h" +#include + namespace chrome_lang_id { namespace { @@ -301,14 +303,16 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, total_num_bytes += num_original_span_bytes; const string selected_text = SelectTextGivenScriptSpan(script_span); + result = FindLanguageOfValidUTF8(selected_text); language = result.language; lang_stats[language].byte_sum += num_original_span_bytes; lang_stats[language].prob_sum += result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; + // Set start and end indices relative to the original input. lang_stats[language].ranges.push_back(std::make_pair( - script_span.offset, script_span.offset + script_span.text_bytes)); + ss.MapBack(0), ss.MapBack(script_span.text_bytes))); } // Sort the languages based on the number of bytes associated with them. From d77234a1eaf08f7b0981b1cba9db323c33f13141 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Fri, 12 Apr 2019 15:39:08 -0700 Subject: [PATCH 08/13] Code cleanup. --- src/nnet_language_identifier.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index 901440b..b455079 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -33,8 +33,6 @@ limitations under the License. 
#include "task_context.h" #include "workspace.h" -#include - namespace chrome_lang_id { namespace { @@ -303,7 +301,6 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, total_num_bytes += num_original_span_bytes; const string selected_text = SelectTextGivenScriptSpan(script_span); - result = FindLanguageOfValidUTF8(selected_text); language = result.language; lang_stats[language].byte_sum += num_original_span_bytes; From 7dedf52728dabeeb2120238708ae5e6f5fc47e69 Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Wed, 17 Apr 2019 11:08:37 -0700 Subject: [PATCH 09/13] Associate probability with each SpanInfo. --- src/nnet_lang_id_test.cc | 4 ++-- src/nnet_language_identifier.cc | 10 +++++----- src/nnet_language_identifier.h | 17 +++++++++++++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/nnet_lang_id_test.cc b/src/nnet_lang_id_test.cc index 1682a82..ee16092 100644 --- a/src/nnet_lang_id_test.cc +++ b/src/nnet_lang_id_test.cc @@ -219,8 +219,8 @@ bool TestMultipleLanguagesInInput() { return false; } // Check that specified ranges for language are correct. - int start_index = result.ranges[0].first; - int end_index = result.ranges[0].second; + int start_index = result.ranges[0].start_index; + int end_index = result.ranges[0].end_index; std::string ranges_text = text.substr(start_index, end_index - start_index); if (result.language.compare("bg") == 0) { if (ranges_text.compare("Този текст е на Български.") != 0) { diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index b455079..fc115db 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -48,8 +48,8 @@ struct LangChunksStats { // Number chunks corresponding to the language. int num_chunks = 0; - // Specifies the ranges of text that language applies to. - std::vector> ranges; + // Specifies the spans of text that language applies to. + std::vector ranges; }; // Compares two pairs based on their values. 
@@ -307,9 +307,9 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, lang_stats[language].prob_sum += result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; - // Set start and end indices relative to the original input. - lang_stats[language].ranges.push_back(std::make_pair( - ss.MapBack(0), ss.MapBack(script_span.text_bytes))); + // Add SpanInfo. Start and end indices are relative to original input. + lang_stats[language].ranges.push_back(SpanInfo( + ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability)); } // Sort the languages based on the number of bytes associated with them. diff --git a/src/nnet_language_identifier.h b/src/nnet_language_identifier.h index 46b7670..1d44444 100644 --- a/src/nnet_language_identifier.h +++ b/src/nnet_language_identifier.h @@ -44,6 +44,19 @@ class LanguageIdEmbeddingFeatureExtractor // Class for detecting the language of a document. class NNetLanguageIdentifier { public: + // Holds probability that Span, specified by start/end indices, is a given + // language. The langauge is not stored here; it can be found in Result, which + // holds a vector of SpanInfo. + struct SpanInfo { + SpanInfo(int start_index_val, int end_index_val, float probability_val) + : start_index(start_index_val), + end_index(end_index_val), + probability(probability_val) {} + int start_index = -1; + int end_index = -1; + float probability = 0.0; + }; + // Information about a predicted language. struct Result { string language = kUnknown; @@ -54,8 +67,8 @@ class NNetLanguageIdentifier { // called, this variable is set to 1. float proportion = 0.0; - // Specifies the ranges of input text that this.language applies to. - std::vector> ranges; + // Specifies the spans of input text that |language| applies to. 
+ std::vector ranges; }; NNetLanguageIdentifier(); From a924f910579731a13a83bfd1aa43e28e351265da Mon Sep 17 00:00:00 2001 From: Akihiro Ota Date: Tue, 23 Apr 2019 13:34:48 -0700 Subject: [PATCH 10/13] Respond to feedback. --- src/nnet_lang_id_test.cc | 24 ++++++++++++------------ src/nnet_language_identifier.cc | 9 +++++---- src/nnet_language_identifier.h | 4 ++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/nnet_lang_id_test.cc b/src/nnet_lang_id_test.cc index ee16092..a7a2520 100644 --- a/src/nnet_lang_id_test.cc +++ b/src/nnet_lang_id_test.cc @@ -211,25 +211,25 @@ bool TestMultipleLanguagesInInput() { } // Skip over undefined language. - if (result.language.compare("und") == 0) + if (result.language == "und") continue; - if (result.ranges.size() != 1) { + if (result.byte_ranges.size() != 1) { std::cout << " Should only detect one span containing " << result.language << std::endl; return false; } - // Check that specified ranges for language are correct. - int start_index = result.ranges[0].start_index; - int end_index = result.ranges[0].end_index; - std::string ranges_text = text.substr(start_index, end_index - start_index); - if (result.language.compare("bg") == 0) { - if (ranges_text.compare("Този текст е на Български.") != 0) { - std::cout << " Incorrect ranges returned for Bulgarian " << std::endl; + // Check that specified byte ranges for language are correct. + int start_index = result.byte_ranges[0].start_index; + int end_index = result.byte_ranges[0].end_index; + std::string byte_ranges_text = text.substr(start_index, end_index - start_index); + if (result.language == "bg") { + if (byte_ranges_text.compare("Този текст е на Български.") != 0) { + std::cout << " Incorrect byte ranges returned for Bulgarian " << std::endl; return false; } - } else if (result.language.compare("en") == 0) { - if (ranges_text.compare("This piece of text is in English. 
") != 0) { - std::cout << " Incorrect ranges returned for English " << std::endl; + } else if (result.language == "en") { + if (byte_ranges_text.compare("This piece of text is in English. ") != 0) { + std::cout << " Incorrect byte ranges returned for English " << std::endl; return false; } } else { diff --git a/src/nnet_language_identifier.cc b/src/nnet_language_identifier.cc index fc115db..c1fa755 100644 --- a/src/nnet_language_identifier.cc +++ b/src/nnet_language_identifier.cc @@ -48,8 +48,8 @@ struct LangChunksStats { // Number chunks corresponding to the language. int num_chunks = 0; - // Specifies the spans of text that language applies to. - std::vector ranges; + // Specifies the byte ranges that language applies to. + std::vector byte_ranges; }; // Compares two pairs based on their values. @@ -301,6 +301,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, total_num_bytes += num_original_span_bytes; const string selected_text = SelectTextGivenScriptSpan(script_span); + result = FindLanguageOfValidUTF8(selected_text); language = result.language; lang_stats[language].byte_sum += num_original_span_bytes; @@ -308,7 +309,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability * num_original_span_bytes; lang_stats[language].num_chunks++; // Add SpanInfo. Start and end indices are relative to original input. 
- lang_stats[language].ranges.push_back(SpanInfo( + lang_stats[language].byte_ranges.push_back(SpanInfo( ss.MapBack(0), ss.MapBack(script_span.text_bytes), result.probability)); } @@ -335,7 +336,7 @@ NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text, result.probability = stats.prob_sum / stats.byte_sum; result.proportion = stats.byte_sum / byte_sum; result.is_reliable = ResultIsReliable(language, result.probability); - result.ranges = stats.ranges; + result.byte_ranges = stats.byte_ranges; results.push_back(result); } diff --git a/src/nnet_language_identifier.h b/src/nnet_language_identifier.h index 1d44444..e5eb862 100644 --- a/src/nnet_language_identifier.h +++ b/src/nnet_language_identifier.h @@ -67,8 +67,8 @@ class NNetLanguageIdentifier { // called, this variable is set to 1. float proportion = 0.0; - // Specifies the spans of input text that |language| applies to. - std::vector ranges; + // Specifies the byte ranges that |language| applies to. + std::vector byte_ranges; }; NNetLanguageIdentifier(); From 22335cdbee28b16dfc6fecb306207a8d8216aa5c Mon Sep 17 00:00:00 2001 From: Alfredo Luque Date: Fri, 30 Aug 2019 13:34:37 -0700 Subject: [PATCH 11/13] updates --- setup.py | 98 +++++++++++++++++++++++++++++++++ src/cld3.pyx | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 setup.py create mode 100644 src/cld3.pyx diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..bc99068 --- /dev/null +++ b/setup.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +import subprocess +from os import path, makedirs + +from setuptools import setup, Extension +from distutils.command.build import build + +from Cython.Build import cythonize + + +PROTOS = ["src/sentence.proto", "src/feature_extractor.proto", + "src/task_spec.proto"] + +SOURCES = ["src/cld3.pyx", + "src/base.cc", + "src/cld_3/protos/src/feature_extractor.pb.cc", + "src/cld_3/protos/src/sentence.pb.cc", + 
"src/cld_3/protos/src/task_spec.pb.cc", + "src/embedding_feature_extractor.cc", + "src/embedding_network.cc", + "src/feature_extractor.cc", + "src/feature_types.cc", + "src/fml_parser.cc", + "src/lang_id_nn_params.cc", + "src/language_identifier_features.cc", + "src/nnet_language_identifier.cc", + "src/registry.cc", + "src/relevant_script_feature.cc", + "src/script_span/fixunicodevalue.cc", + "src/script_span/generated_entities.cc", + "src/script_span/generated_ulscript.cc", + "src/script_span/getonescriptspan.cc", + "src/script_span/offsetmap.cc", + "src/script_span/text_processing.cc", + "src/script_span/utf8statetable.cc", + "src/sentence_features.cc", + "src/task_context.cc", + "src/task_context_params.cc", + "src/unicodetext.cc", + "src/utils.cc", + "src/workspace.cc"] + +INCLUDES = ["/usr/local/include","./src", "./src/cld_3/protos/"] + +LIBRARIES = ["protobuf"] + +LONG_DESCRIPTION = \ +"""Python bindings for the CLD3 language classification library by Google.""" + +CLASSIFIERS = [ + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 3", + "Programming Language :: C++", + "Operating System :: OS Independent", + "Development Status :: 4 - Beta", + "Topic :: Text Processing :: Linguistic", + "Intended Audience :: Developers",] + + +class BuildProtobuf(build): + def run(self): + if not path.exists("src/cld_3/protos"): + # Create protobufs dir + makedirs("src/cld_3/protos") + + # Build protobuf stuff + command = ["protoc"] + command.extend(PROTOS) + command.append("--cpp_out={}".format( + path.join("src", "cld_3", "protos"))) + subprocess.run(command, check=True) + + build.run(self) + + +ext = Extension( + "cld3", + sources=SOURCES, + include_dirs=INCLUDES, + libraries=LIBRARIES, + language="c++", + extra_compile_args=["-std=c++11"]) + +setup( + name="cld3", + version="0.2.3", + cmdclass={"build": BuildProtobuf}, + 
author="Google, Johannes Baiter, Elizabeth Myers", + author_email="elizabeth@interlinked.me", + description="CLD3 Python bindings", + long_description=LONG_DESCRIPTION, + license="Apache2", + keywords=["cld3", "cffi"], + url="https://github.com/iamthebot/cld3", + ext_modules=cythonize([ext])) diff --git a/src/cld3.pyx b/src/cld3.pyx new file mode 100644 index 0000000..ed4eb88 --- /dev/null +++ b/src/cld3.pyx @@ -0,0 +1,153 @@ +from libcpp.vector cimport vector +from libcpp.string cimport string + +from collections import namedtuple + + +cdef extern from "nnet_language_identifier.h" namespace "chrome_lang_id::NNetLanguageIdentifier": + cdef struct Result: + string language + float probability + bint is_reliable + float proportion + + +cdef extern from "nnet_language_identifier.h" namespace "chrome_lang_id": + cdef cppclass NNetLanguageIdentifier: + NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes); + Result FindLanguage(string &text) + vector[Result] FindTopNMostFreqLangs(string &text, int num_langs) + const char kUnknown[] + + +LanguagePrediction = namedtuple("LanguagePrediction", + ("language", "probability", "is_reliable", + "proportion")) + +cdef class LanguageIdentifier: + """ + Basic Python API for using CLD3 + """ + cdef NNetLanguageIdentifier* model + cdef unsigned int min_bytes + cdef unsigned int max_bytes + + def __init__(self, min_bytes=0, max_bytes=1024): + """ + Initialize a LanguageIdentifier + + :param min_bytes: The minimum number of bytes to look at for the prediction. + :param max_bytes: The maximum number of bytes to consider + """ + self.min_bytes = min_bytes + self.max_bytes = max_bytes + self.model = new NNetLanguageIdentifier(self.min_bytes, self.max_bytes) + + def get_language(self, unicode text): + """Get the most likely language for the given text. + + The prediction is based on the first N bytes where N is the minumum between + the number of interchange valid UTF8 bytes and max_bytes. 
If N is less + than min_bytes long, then this function returns None. + + If the language cannot be determined, None will be returned. + """ + cdef Result res = self.model.FindLanguage(text.encode('utf8')) + + if str(res.language) != self.model.kUnknown: + language = res.language.decode('utf8') + return LanguagePrediction(language, res.probability, res.is_reliable, + res.proportion) + else: + return None + + def get_frequent_languages( + self, + unicode text, + int num_langs, + ): + """Find the most frequent languages in the given text. + + Splits the input text (up to the first byte, if any, that is not + interchange valid UTF8) into spans based on the script, predicts a language + for each span, and returns a list storing the top num_langs most frequent + languages along with additional information (e.g., proportions). The number + of bytes considered for each span is the minimum between the size of the + span and max_bytes. If more languages are requested than what is available + in the input, then the list returned will only have the number of results + found. Also, if the size of the span is less than min_bytes long, then the + span is skipped. If the input text is too long, only the first 1000 bytes + are processed. + """ + cdef vector[Result] results = self.model.FindTopNMostFreqLangs( + text.encode('utf8'), + num_langs + ) + out = [] + for res in results: + if str(res.language) != self.model.kUnknown: + language = res.language.decode('utf8') + out.append(LanguagePrediction( + language, res.probability, res.is_reliable, res.proportion)) + return out + + +def get_language(unicode text, unsigned int min_bytes=0, unsigned int max_bytes=1000): + """Get the most likely language for the given text. + + The prediction is based on the first N bytes where N is the minumum between + the number of interchange valid UTF8 bytes and max_bytes. If N is less + than min_bytes long, then this function returns None. 
+ + If the language cannot be determined, None will be returned. + + This function requires initialization of a new identifier on each call so it's best + to use the LanguageIdentifier class instead for multiple calls + """ + cdef NNetLanguageIdentifier* ident = new NNetLanguageIdentifier(min_bytes, max_bytes) + cdef Result res = ident.FindLanguage(text.encode('utf8')) + del ident + if str(res.language) != ident.kUnknown: + language = res.language.decode('utf8') + return LanguagePrediction(language, res.probability, res.is_reliable, + res.proportion) + else: + return None + + + +def get_frequent_languages( + unicode text, + unsigned int num_langs, + unsigned int min_bytes=0, + int max_bytes=1000 +): + """Find the most frequent languages in the given text. + + Splits the input text (up to the first byte, if any, that is not + interchange valid UTF8) into spans based on the script, predicts a language + for each span, and returns a list storing the top num_langs most frequent + languages along with additional information (e.g., proportions). The number + of bytes considered for each span is the minimum between the size of the + span and max_bytes. If more languages are requested than what is available + in the input, then the list returned will only have the number of results + found. Also, if the size of the span is less than min_bytes long, then the + span is skipped. If the input text is too long, only the first 1000 bytes + are processed. 
+ + This function requires initialization of a new identifier on each call so it's best + to use the LanguageIdentifier class instead for multiple calls + """ + cdef NNetLanguageIdentifier* ident = new NNetLanguageIdentifier(min_bytes, max_bytes) + cdef vector[Result] results = ident.FindTopNMostFreqLangs( + text.encode('utf8'), + num_langs + ) + del ident + out = [] + for res in results: + if str(res.language) != ident.kUnknown: + language = res.language.decode('utf8') + out.append(LanguagePrediction( + language, res.probability, res.is_reliable, res.proportion)) + return out From e58f92e1e502133ca5c90604ad63b450dd8e7059 Mon Sep 17 00:00:00 2001 From: Alfredo Luque Date: Fri, 30 Aug 2019 13:40:56 -0700 Subject: [PATCH 12/13] update readme --- README.md | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 8600833..77f2659 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,15 @@ -# Compact Language Detector v3 (CLD3) +# Compact Language Detector v3 (CLD3) Python Edition * [Model](#model) * [Installation](#installation) * [Contact](#contact) * [Credits](#credits) +### Notes +This is an effort to fix issues with the initial Python fork at https://github.com/Elizafox/cld3 +including memory leaks and no reuse of a Language model for multiple calls. This also pulls in much newer cld3 upstream code. + + ### Model CLD3 is a neural network model for language identification. This package @@ -27,29 +32,21 @@ To get a language prediction for the input text, we simply perform a forward ![Figure](model.png "CLD3") ### Installation -CLD3 is designed to run in the Chrome browser, so it relies on code in -[Chromium](http://www.chromium.org/). -The steps for building and running the demo of the language detection model are: - -- [check out](http://www.chromium.org/developers/how-tos/get-the-code) the - Chromium repository. 
-- copy the code to `//third_party/cld_3` -- Uncomment `language_identifier_main` executable in `src/BUILD.gn`. -- build and run the model using the commands: - -```shell -gn gen out/Default -ninja -C out/Default third_party/cld_3/src/src:language_identifier_main -out/Default/language_identifier_main +Building the Python wheel requires the protobuf compiler and its headers to be installed. +If you run into issues with protobufs not compiling, just go into the `src` directory and run + +``` +mkdir -p cld_3/protos +protoc --cpp_out=cld_3/protos *.proto ``` -### Bugs and Feature Requests -Open a [GitHub issue](https://github.com/google/cld3/issues) for this repository to file bugs and feature requests. +To generate a python wheel (from the root of this repo): -### Announcements and Discussion +``` +python setup.py bdist_wheel +``` -For announcements regarding major updates as well as general discussion list, please subscribe to: -[cld3-users@googlegroups.com](https://groups.google.com/forum/#!forum/cld3-users) +Builds have been tested with GCC9.0 on Ubuntu 18.04 and Apple Clang 11.0.0 on OSX 10.15 (Catalina Beta) ### Credits @@ -71,3 +68,5 @@ Original authors of the code in this package include (in alphabetical order): * Slav Petrov * Stefan Istrate * Terry Koo + +and Elizabeth Myers for the original Python bindings From 93b53a87e138ff2d78ffaf52a01ac01ec04570d9 Mon Sep 17 00:00:00 2001 From: Alfredo Luque Date: Fri, 30 Aug 2019 15:46:57 -0700 Subject: [PATCH 13/13] fix protobuf --- setup.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index bc99068..332f46f 100644 --- a/setup.py +++ b/setup.py @@ -9,14 +9,14 @@ from Cython.Build import cythonize -PROTOS = ["src/sentence.proto", "src/feature_extractor.proto", - "src/task_spec.proto"] +PROTOS = ["sentence.proto", "feature_extractor.proto", + "task_spec.proto"] SOURCES = ["src/cld3.pyx", "src/base.cc", - "src/cld_3/protos/src/feature_extractor.pb.cc", - 
"src/cld_3/protos/src/sentence.pb.cc", - "src/cld_3/protos/src/task_spec.pb.cc", + "src/cld_3/protos/feature_extractor.pb.cc", + "src/cld_3/protos/sentence.pb.cc", + "src/cld_3/protos/task_spec.pb.cc", "src/embedding_feature_extractor.cc", "src/embedding_network.cc", "src/feature_extractor.cc", @@ -41,7 +41,7 @@ "src/utils.cc", "src/workspace.cc"] -INCLUDES = ["/usr/local/include","./src", "./src/cld_3/protos/"] +INCLUDES = ["/usr/local/include","/src", "./src/cld_3/protos/"] LIBRARIES = ["protobuf"] @@ -70,8 +70,8 @@ def run(self): command = ["protoc"] command.extend(PROTOS) command.append("--cpp_out={}".format( - path.join("src", "cld_3", "protos"))) - subprocess.run(command, check=True) + path.join("cld_3", "protos"))) + subprocess.run(command, check=True, cwd='src') build.run(self) @@ -84,6 +84,7 @@ def run(self): language="c++", extra_compile_args=["-std=c++11"]) + setup( name="cld3", version="0.2.3",