Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update code from iamthebot/cld3 and google/cld3 #1

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
build
69 changes: 69 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# This CMake script only builds a static cld3 library and the unit tests.

# cmake_minimum_required() has to come before project(): it establishes the
# policy settings that project() relies on when it probes the toolchain.
# 3.9 is the floor because older versions of CMake do not search for / find
# protobuf-lite.
cmake_minimum_required(VERSION 3.9)

project(cld3)

# Protobuf is a hard requirement; configuration stops here if it is missing.
find_package(Protobuf REQUIRED)

# Report what was detected, one "<variable>= <value>" line per variable.
foreach(_cld3_var IN ITEMS Protobuf_FOUND Protobuf_VERSION)
  message(STATUS "${_cld3_var}= ${${_cld3_var}}")
endforeach()

# No version compatibility check is performed; this is only a reminder.
message(WARNING "Protobuf 2.5 and CLD3 seems happy together. This script does NOT check if your verison of protobuf is compatible.")

# Protobuf_LITE_LIBRARIES is usually /usr/lib64/libprotobuf-lite.so
foreach(_cld3_var IN ITEMS Protobuf_LIBRARIES Protobuf_LITE_LIBRARIES)
  message(STATUS "${_cld3_var}= ${${_cld3_var}}")
endforeach()
unset(_cld3_var)

# The stock protobuf_generate_cpp() writes the *.pb.* files directly into the
# CMake build directory, but the CLD3 sources include them through the
# hard-coded path cld_3/protos/*.pb.h. The project-local
# my_protobuf_generate_cpp() takes the output sub-directory as its first
# argument, so the generated files end up under cld_3/protos.
include("${CMAKE_CURRENT_SOURCE_DIR}/misc/myprotobuf.cmake")
my_protobuf_generate_cpp(
  cld_3/protos
  PROTO_SRCS
  PROTO_HDRS
  src/feature_extractor.proto
  src/sentence.proto
  src/task_spec.proto
)
message(STATUS "PROTO_HDRS= ${PROTO_HDRS}")

# Build settings shared by every target declared below.
#
# Compiler switches are requested through CMake's own variables instead of
# being passed through add_definitions(), which is meant for preprocessor
# definitions only and would hand GCC-style flags to every compiler.

# Position independent code (-fPIC for libraries, -fPIE for executables).
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# C++11 is needed for std::to_string(), ...
# Extensions are switched off so that the flag is -std=c++11, not -std=gnu++11.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Select the pre-C++11 libstdc++ ABI.
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)

# Needed to include the generated pb headers (cld_3/protos/*.pb.h).
include_directories(${CMAKE_CURRENT_BINARY_DIR})

# Protobuf's own headers; required when protobuf is not installed in a
# directory the compiler searches by default.
include_directories(${Protobuf_INCLUDE_DIRS})

# The cld3 library: generated protobuf sources plus the hand-written sources.
#
# src/script_span/getonescriptspan_test.cc is deliberately NOT listed here: it
# is a test program and is built as its own executable, so it must not be
# compiled into the library as well.
# src/script_span/generated_ulscript.cc is listed to match the source set that
# setup.py compiles for the same library.
add_library(${PROJECT_NAME}
  ${PROTO_SRCS} ${PROTO_HDRS}
  src/base.cc
  src/embedding_feature_extractor.cc
  src/embedding_network.cc
  src/feature_extractor.cc
  src/feature_extractor.h
  src/feature_types.cc
  src/fml_parser.cc
  src/language_identifier_features.cc
  src/lang_id_nn_params.cc
  src/nnet_language_identifier.cc
  src/registry.cc
  src/relevant_script_feature.cc
  src/sentence_features.cc
  src/task_context.cc
  src/task_context_params.cc
  src/unicodetext.cc
  src/utils.cc
  src/workspace.cc

  src/script_span/generated_entities.cc
  src/script_span/generated_ulscript.cc
  src/script_span/getonescriptspan.cc
  src/script_span/getonescriptspan.h
  src/script_span/utf8statetable.cc
  src/script_span/offsetmap.cc
  src/script_span/text_processing.cc
  src/script_span/text_processing.h
  src/script_span/fixunicodevalue.cc
)

# Unit test / demo executables.
# Each listed source becomes an executable named after the file (extension
# stripped) and is linked against cld3 and protobuf-lite.
foreach(_cld3_exe_source IN ITEMS
    src/language_identifier_main.cc
    src/script_span/getonescriptspan_test.cc
    src/language_identifier_features_test.cc)
  get_filename_component(_cld3_exe_name ${_cld3_exe_source} NAME_WE)
  add_executable(${_cld3_exe_name} ${_cld3_exe_source})
  target_link_libraries(${_cld3_exe_name} cld3 ${Protobuf_LITE_LIBRARIES})
endforeach()
unset(_cld3_exe_source)
unset(_cld3_exe_name)
39 changes: 19 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
# Compact Language Detector v3 (CLD3)
# Compact Language Detector v3 (CLD3) Python Edition

* [Model](#model)
* [Installation](#installation)
* [Contact](#contact)
* [Credits](#credits)

### Notes
This fork fixes issues in the original Python bindings at https://github.com/Elizafox/cld3,
including memory leaks and the inability to reuse a language model across multiple calls. It also pulls in much newer upstream cld3 code.


### Model

CLD3 is a neural network model for language identification. This package
Expand All @@ -27,29 +32,21 @@ To get a language prediction for the input text, we simply perform a forward
![Figure](model.png "CLD3")

### Installation
CLD3 is designed to run in the Chrome browser, so it relies on code in
[Chromium](http://www.chromium.org/).
The steps for building and running the demo of the language detection model are:

- [check out](http://www.chromium.org/developers/how-tos/get-the-code) the
Chromium repository.
- copy the code to `//third_party/cld_3`
- Uncomment `language_identifier_main` executable in `src/BUILD.gn`.
- build and run the model using the commands:

```shell
gn gen out/Default
ninja -C out/Default third_party/cld_3/src/src:language_identifier_main
out/Default/language_identifier_main
Building the Python wheel requires the protobuf compiler and its headers to be installed.
If you run into issues with protobufs not compiling, just go into the `src` directory and run

```
mkdir -p cld_3/protos
protoc --cpp_out=cld_3/protos *.proto
```
### Bugs and Feature Requests

Open a [GitHub issue](https://github.com/google/cld3/issues) for this repository to file bugs and feature requests.
To generate a python wheel (from the root of this repo):

### Announcements and Discussion
```
python setup.py bdist_wheel
```

For announcements regarding major updates as well as general discussion list, please subscribe to:
[cld3-users@googlegroups.com](https://groups.google.com/forum/#!forum/cld3-users)
Builds have been tested with GCC 9.0 on Ubuntu 18.04 and Apple Clang 11.0.0 on macOS 10.15 (Catalina Beta).

### Credits

Expand All @@ -71,3 +68,5 @@ Original authors of the code in this package include (in alphabetical order):
* Slav Petrov
* Stefan Istrate
* Terry Koo

and Elizabeth Myers for the original Python bindings
58 changes: 58 additions & 0 deletions misc/myprotobuf.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Special PROTOBUF_GENERATE_CPP which allows to set the output folder:
# From https://stackoverflow.com/users/1600278/akira-okumura
#
# my_protobuf_generate_cpp(<PATH> <SRCS> <HDRS> <proto>...)
#
#   PATH  - output directory for the generated files, relative to
#           CMAKE_CURRENT_BINARY_DIR.
#   SRCS  - name of the variable that receives the generated *.pb.cc paths.
#   HDRS  - name of the variable that receives the generated *.pb.h paths.
#   proto - one or more .proto files.
#
# Variables read:
#   PROTOBUF_GENERATE_CPP_APPEND_PATH - when true, the directory of every
#       .proto file is passed to protoc with -I; otherwise only
#       CMAKE_CURRENT_SOURCE_DIR is.
#   PROTOBUF_IMPORT_DIRS - extra directories passed to protoc with -I.
#   PROTOBUF_PROTOC_EXECUTABLE / Protobuf_PROTOC_EXECUTABLE - protoc to run.
#
# Example:
#   my_protobuf_generate_cpp(cld_3/protos PROTO_SRCS PROTO_HDRS src/a.proto)
function(my_protobuf_generate_cpp PATH SRCS HDRS)
  if(NOT ARGN)
    message(SEND_ERROR "Error: MY_PROTOBUF_GENERATE_CPP() called without any proto files")
    return()
  endif()

  # find_package(Protobuf REQUIRED) does not insist on protoc being present,
  # so check for it here rather than failing later with an empty command.
  if(PROTOBUF_PROTOC_EXECUTABLE)
    set(_protoc "${PROTOBUF_PROTOC_EXECUTABLE}")
  elseif(Protobuf_PROTOC_EXECUTABLE)
    set(_protoc "${Protobuf_PROTOC_EXECUTABLE}")
  else()
    message(SEND_ERROR "Error: MY_PROTOBUF_GENERATE_CPP() could not find protoc; call find_package(Protobuf) first")
    return()
  endif()

  # Start from an empty list: function scope inherits the caller's variables,
  # and a stale value would leak into the protoc command line.
  set(_protobuf_include_path)

  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
    # Create an include path for each file specified
    foreach(FIL ${ARGN})
      get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
      get_filename_component(ABS_PATH ${ABS_FIL} PATH)
      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
      if(_contains_already EQUAL -1)
        list(APPEND _protobuf_include_path -I ${ABS_PATH})
      endif()
    endforeach()
  else()
    set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
  endif()

  if(DEFINED PROTOBUF_IMPORT_DIRS)
    foreach(DIR ${PROTOBUF_IMPORT_DIRS})
      get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
      if(_contains_already EQUAL -1)
        list(APPEND _protobuf_include_path -I ${ABS_PATH})
      endif()
    endforeach()
  endif()

  # protoc does not create its output directory; make it once, at configure
  # time, instead of once per .proto file.
  set(_out_dir "${CMAKE_CURRENT_BINARY_DIR}/${PATH}")
  file(MAKE_DIRECTORY "${_out_dir}")

  set(${SRCS})
  set(${HDRS})
  foreach(FIL ${ARGN})
    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
    get_filename_component(FIL_WE ${FIL} NAME_WE)

    set(_generated_src "${_out_dir}/${FIL_WE}.pb.cc")
    set(_generated_hdr "${_out_dir}/${FIL_WE}.pb.h")
    list(APPEND ${SRCS} "${_generated_src}")
    list(APPEND ${HDRS} "${_generated_hdr}")

    # Build rule: regenerate the pair whenever the .proto file changes.
    add_custom_command(
      OUTPUT "${_generated_src}"
             "${_generated_hdr}"
      COMMAND "${_protoc}"
      ARGS --cpp_out "${_out_dir}" ${_protobuf_include_path} "${ABS_FIL}"
      DEPENDS "${ABS_FIL}"
      COMMENT "Running C++ protocol buffer compiler on ${FIL}"
      VERBATIM)
  endforeach()

  # Hand the two lists back to the caller.
  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
endfunction()
99 changes: 99 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python

import subprocess
from os import path, makedirs

from setuptools import setup, Extension
from distutils.command.build import build

from Cython.Build import cythonize


# .proto files compiled by protoc; names are relative to the src/ directory.
PROTOS = ["sentence.proto", "feature_extractor.proto",
          "task_spec.proto"]

# Everything compiled into the extension: the Cython wrapper, the protobuf
# sources generated under src/cld_3/protos, and the CLD3 C++ sources.
SOURCES = ["src/cld3.pyx",
           "src/base.cc",
           "src/cld_3/protos/feature_extractor.pb.cc",
           "src/cld_3/protos/sentence.pb.cc",
           "src/cld_3/protos/task_spec.pb.cc",
           "src/embedding_feature_extractor.cc",
           "src/embedding_network.cc",
           "src/feature_extractor.cc",
           "src/feature_types.cc",
           "src/fml_parser.cc",
           "src/lang_id_nn_params.cc",
           "src/language_identifier_features.cc",
           "src/nnet_language_identifier.cc",
           "src/registry.cc",
           "src/relevant_script_feature.cc",
           "src/script_span/fixunicodevalue.cc",
           "src/script_span/generated_entities.cc",
           "src/script_span/generated_ulscript.cc",
           "src/script_span/getonescriptspan.cc",
           "src/script_span/offsetmap.cc",
           "src/script_span/text_processing.cc",
           "src/script_span/utf8statetable.cc",
           "src/sentence_features.cc",
           "src/task_context.cc",
           "src/task_context_params.cc",
           "src/unicodetext.cc",
           "src/utils.cc",
           "src/workspace.cc"]

# Header search path. The source directory is given relative to the repository
# root ("./src"), like the protos directory next to it; the previous value
# "/src" pointed at the filesystem root instead.
INCLUDES = ["/usr/local/include", "./src", "./src/cld_3/protos/"]

# Libraries the extension is linked against.
LIBRARIES = ["protobuf"]

LONG_DESCRIPTION = \
    """Python bindings for the CLD3 language classification library by Google."""

# Trove classifiers published with the package metadata.
CLASSIFIERS = [
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: Implementation :: CPython",
    "Programming Language :: Python :: 2",
    "Programming Language :: Python :: 3",
    "Programming Language :: C++",
    "Operating System :: OS Independent",
    "Development Status :: 4 - Beta",
    "Topic :: Text Processing :: Linguistic",
    "Intended Audience :: Developers",]


def _protobuf_outputs_stale(proto_dir):
    """Return True when any generated protobuf file is missing or outdated.

    For every entry of PROTOS, the matching ``.pb.cc`` and ``.pb.h`` files are
    looked up in ``proto_dir`` and compared against the ``.proto`` file under
    ``src``; a generated file older than its ``.proto`` counts as outdated.
    """
    for proto in PROTOS:
        proto_source = path.join("src", proto)
        stem = path.splitext(proto)[0]
        for suffix in (".pb.cc", ".pb.h"):
            generated = path.join(proto_dir, stem + suffix)
            if not path.exists(generated):
                return True
            if path.getmtime(generated) < path.getmtime(proto_source):
                return True
    return False


class BuildProtobuf(build):
    """``build`` command that generates the protobuf C++ sources first."""

    def run(self):
        """Generate src/cld_3/protos/*.pb.* if needed, then run the build.

        Raises:
            subprocess.CalledProcessError: protoc exited with a non-zero status.
        """
        proto_dir = path.join("src", "cld_3", "protos")
        if not path.exists(proto_dir):
            # Create protobufs dir
            makedirs(proto_dir)

        # Decide from the generated files themselves, not from the directory:
        # an existing but empty or stale directory must not skip protoc.
        if _protobuf_outputs_stale(proto_dir):
            # Build protobuf stuff
            command = ["protoc"]
            command.extend(PROTOS)
            command.append("--cpp_out={}".format(
                path.join("cld_3", "protos")))
            # check_call is available on both Python 2 and Python 3, unlike
            # subprocess.run; it raises when protoc fails.
            subprocess.check_call(command, cwd='src')

        build.run(self)


# Build settings for the single Cython/C++ extension module.
extension_options = {
    "sources": SOURCES,
    "include_dirs": INCLUDES,
    "libraries": LIBRARIES,
    "language": "c++",
    "extra_compile_args": ["-std=c++11"],
}
ext = Extension("cld3", **extension_options)

# Package metadata handed to setuptools.
package_metadata = {
    "name": "cld3",
    "version": "0.2.3",
    # Route "build" through BuildProtobuf.
    "cmdclass": {"build": BuildProtobuf},
    "author": "Google, Johannes Baiter, Elizabeth Myers",
    "author_email": "elizabeth@interlinked.me",
    "description": "CLD3 Python bindings",
    "long_description": LONG_DESCRIPTION,
    "license": "Apache2",
    "keywords": ["cld3", "cffi"],
    "url": "https://github.com/iamthebot/cld3",
}

setup(ext_modules=cythonize([ext]), **package_metadata)
2 changes: 0 additions & 2 deletions src/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,6 @@ static_library("cld_3") {
]
public_deps = [
"//third_party/protobuf:protobuf_lite",
]
deps = [
":protos",
]
}
Expand Down
Loading