Skip to content

Commit

Permalink
Add FasterTokenizer Operator (#34491)
Browse files Browse the repository at this point in the history
Add tokenizer-related functionality for Transformer models so that text preprocessing is consistent between training and prediction.

* support the text string as an input Tensor
* support the "VOCAB" unordered_map<wstring, int> as an input Tensor to look up tokens
* Tokenizer used for BERT. This tokenizer applies an end-to-end, text string to wordpiece tokenization.
* It first applies basic tokenization, followed by wordpiece tokenization.
  • Loading branch information
Steffy-zxf authored Oct 20, 2021
1 parent 873ee4e commit 3f2d6a3
Show file tree
Hide file tree
Showing 53 changed files with 3,604 additions and 157 deletions.
51 changes: 51 additions & 0 deletions cmake/external/utf8proc.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Downloads, builds and installs utf8proc as an external project and exposes
# it to the rest of the build as the imported static library target `utf8proc`.
INCLUDE(ExternalProject)

# Working (prefix) directory and install directory, both under THIRD_PARTY_PATH.
SET(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc)
SET(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc)
# As we add extra features for utf8proc, we use the non-official repo
# NOTE(review): the URL below points at JuliaStrings/utf8proc; confirm whether
# a fork was intended or whether the comment above is stale.
SET(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git)
# Git tag checked out by ExternalProject_Add below.
SET(UTF8PROC_TAG v2.6.1)

# Path of the static library produced by the build; the file name differs
# between Windows and other platforms.
IF(WIN32)
SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib")
# NOTE(review): presumably required so utf8proc.h is consumed as a static
# library on Windows -- confirm against the utf8proc header.
add_definitions(-DUTF8PROC_STATIC)
ELSE(WIN32)
SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a")
ENDIF(WIN32)

# Make the installed utf8proc headers visible to targets defined after this
# include.
INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include)

# Clone the pinned tag and build/install it into UTF8PROC_INSTALL_DIR.
# UPDATE_COMMAND is empty, so no update step runs after the initial checkout.
# BUILD_BYPRODUCTS declares the library file this external project generates.
# NOTE(review): confirm that BUILD_SHARED / BUILD_STATIC are option names the
# utf8proc CMakeLists at this tag actually honors.
ExternalProject_Add(
extern_utf8proc
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY ${UTF8PROC_REPOSITORY}
GIT_TAG ${UTF8PROC_TAG}
PREFIX ${UTF8PROC_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DBUILD_SHARED=ON
-DBUILD_STATIC=ON
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES}
)

# Imported target wrapping the built static library. The dependency on
# extern_utf8proc makes the external build run before targets that link
# against `utf8proc`.
ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES})
ADD_DEPENDENCIES(utf8proc extern_utf8proc)
5 changes: 5 additions & 0 deletions cmake/inference_lib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,11 @@ function(copy_part_of_thrid_party TARGET DST)
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)

set(dst_dir "${DST}/third_party/install/utf8proc")
copy(${TARGET}
SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)

if (WITH_CRYPTO)
set(dst_dir "${DST}/third_party/install/cryptopp")
copy(${TARGET}
Expand Down
4 changes: 4 additions & 0 deletions cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,10 @@ include(external/threadpool)# download threadpool
include(external/dlpack) # download dlpack
include(external/xxhash) # download, build, install xxhash
include(external/warpctc) # download, build, install warpctc
include(external/utf8proc) # download, build, install utf8proc

list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc)
include(external/lapack) # download, build, install lapack

list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash)
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/framework/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ proto_library(data_feed_proto SRCS data_feed.proto)
proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
data_feed_proto)

cc_library(string_array SRCS string_array.cc DEPS utf8proc)

cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
if(WITH_GPU)
Expand Down
8 changes: 6 additions & 2 deletions paddle/fluid/framework/executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,18 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,

if (var->Persistable()) {
auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());

VLOG(3) << "Initialize Variable " << var->Name();
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
<< " global, which pointer is " << ptr << " type is "
<< static_cast<int>(var->GetType());
} else {
auto* ptr = scope->Var(var->Name());
InitializeVariable(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
<< " locally, which pointer is " << ptr << "Variable Type "
<< static_cast<int>(var->GetType());
}
}
} else {
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/executor_gc_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ void DeleteUnusedTensors(const Scope &scope,
for (auto &t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else if (var->IsType<Strings>()) {
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Type %s of variable %s is not supported eager deletion.",
Expand Down
20 changes: 18 additions & 2 deletions paddle/fluid/framework/feed_fetch_method.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License. */

#include <string>

#include <boost/variant.hpp>
#include "glog/logging.h"

namespace paddle {
Expand All @@ -35,9 +36,24 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith(input);
auto& val = BOOST_GET(LoDTensor, feed_inputs[index]);
val.ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
val.set_lod(input.lod());
}

// Stores a copy of the string list `input` in slot `index` of the feed list
// held by the variable `var_name` in `scope`.
//
// The variable is obtained through Scope::Var, which creates it when it is
// not found. The feed list is grown to `index + 1` entries when it is too
// short. Unlike the LoDTensor overload, nothing is shared with `input`: the
// slot receives its own copy of the strings.
void SetFeedVariable(Scope* scope, const Strings& input,
                     const std::string& var_name, size_t index) {
  VLOG(3) << "SetFeedStringVariable name=" << var_name << " index=" << index;
  auto* feed_list = scope->Var(var_name)->GetMutable<FeedList>();
  if (feed_list->size() <= index) {
    feed_list->resize(index + 1);
  }
  // Copy-assign the strings into the requested slot.
  (*feed_list)[index] = input;
}

FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/framework/feed_fetch_method.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ limitations under the License. */

#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"

namespace paddle {
namespace framework {
Expand All @@ -28,6 +29,9 @@ class Scope;
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index);

void SetFeedVariable(Scope* scope, const Strings& input,
const std::string& var_name, size_t index);

FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index);

Expand Down
12 changes: 11 additions & 1 deletion paddle/fluid/framework/feed_fetch_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <vector>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/platform/variant.h"

namespace paddle {
namespace framework {
using FeedType = LoDTensor;
using FeedType = boost::variant<LoDTensor, Strings>;
using FeedList = std::vector<FeedType>;

using FetchType = boost::variant<LoDTensor, LoDTensorArray>;
Expand All @@ -43,6 +46,13 @@ inline bool data_is_lod_tensor_array(const FetchType &data) {
return false;
}

// Returns true when the feed item `data` currently holds a Strings value.
inline bool data_is_string_tensor(const FeedType &data) {
  return data.type() == typeid(Strings);
}

static const char kFeedOpType[] = "feed";
static const char kFetchOpType[] = "fetch";

Expand Down
9 changes: 9 additions & 0 deletions paddle/fluid/framework/framework.proto
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@ message VarType {
// in operators like nccl_op
RAW = 17;
TUPLE = 18;

STRING = 25;
STRINGS = 26;
VOCAB = 27;
FEED_LIST = 28;
}

required Type type = 1;
Expand Down Expand Up @@ -175,6 +180,10 @@ message VarType {

message Tuple { repeated Type element_type = 1; }
optional Tuple tuple = 7;

optional TensorDesc string = 8;
optional TensorDesc strings = 9;
optional TensorDesc vocab = 10;
}

message VarDesc {
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/framework/operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name,
} else {
return var->Get<SelectedRows>().GetCompleteDims();
}
} else if (var->IsType<Strings>()) {
return DDim({static_cast<int64_t>(var->Get<Strings>().size())});
} else {
return DDim({-1});
}
Expand Down Expand Up @@ -106,6 +108,8 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
} else {
return DataTypeToString(tensor.type());
}
} else if (var->IsType<Strings>()) {
return "strings";
} else {
return "";
}
Expand Down
104 changes: 104 additions & 0 deletions paddle/fluid/framework/string_array.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <utf8proc.h>

#include <exception>

#include "glog/logging.h"
#include "paddle/fluid/framework/string_array.h"

namespace paddle {
namespace framework {

// UTF-8 <-> wchar_t converter shared by ConvertStrToWstr and ConvertWstrToStr
// in this file.
// NOTE(review): this is a single mutable object with external linkage that
// both conversion functions use without locking -- confirm callers never
// convert from multiple threads at once.
std::wstring_convert<std::codecvt_utf8<wchar_t>> kConverter;

// Convert the std::string type to the std::wstring type.
bool ConvertStrToWstr(const std::string& src, std::wstring* res) {
try {
*res = kConverter.from_bytes(src);
} catch (std::range_error& e) {
VLOG(3) << "The string " << src << " was converted to unicode failedly! ";
return false;
}
return true;
}

// Converts the wide string `src` into a UTF-8 encoded std::string stored in
// `*res`. No exception is caught here, so anything thrown by the converter
// propagates to the caller.
void ConvertWstrToStr(const std::wstring& src, std::string* res) {
  res->assign(kConverter.to_bytes(src));
}

// Normalization Form Canonical Decomposition.
// Writes the NFD form of the UTF-8 string `s` (as computed by utf8proc_NFD)
// into `*ret`. `*ret` is cleared first and stays empty when utf8proc_NFD
// returns a null pointer.
void NFD(const std::string& s, std::string* ret) {
  ret->clear();
  char* result = reinterpret_cast<char*>(
      utf8proc_NFD(reinterpret_cast<const unsigned char*>(s.c_str())));
  if (result != nullptr) {
    // Copy straight into the output; no intermediate std::string temporary
    // (and no std::move of a temporary) is needed.
    ret->assign(result);
    // The buffer returned by utf8proc_NFD is released here with free().
    free(result);
  }
}

// Write the data which is type of
// std::unordered_map<std::string, int32_t> to ostream.
//
// Binary layout, in order:
//   size_t  number of entries
//   per entry:
//     size_t   token length in bytes
//     char[]   token bytes (no terminator)
//     int32_t  token id
// Integers are written as raw host-memory bytes, and entries follow the map's
// iteration order.
void StringMapToStream(std::ostream& os,
                       const std::unordered_map<std::string, int32_t>& data) {
  // Firstly write the data size.
  const size_t entry_count = data.size();
  os.write(reinterpret_cast<const char*>(&entry_count), sizeof(entry_count));

  // Then write the entries. Bind by const reference so tokens are not copied.
  for (const auto& entry : data) {
    const std::string& token = entry.first;
    const int32_t token_id = entry.second;
    // write the token
    const size_t length = token.size();
    os.write(reinterpret_cast<const char*>(&length), sizeof(length));
    os.write(token.data(), static_cast<std::streamsize>(length));
    // write the token_id
    os.write(reinterpret_cast<const char*>(&token_id), sizeof(token_id));
  }
}

// Read the data which is type of
// std::unordered_map<std::string, int32_t> from istream.
//
// Expects the layout written by StringMapToStream: a size_t entry count, then
// for each entry a size_t token length, the token bytes and an int32_t id.
// Entries are added with emplace, so a token that is already present in
// `*data` keeps its existing id.
//
// Every read is checked. If the stream ends or fails part-way, reading stops
// at that point: entries that were read completely stay in `*data`, the
// partial entry is dropped, and the stream's error state lets the caller
// detect the problem.
void StringMapFromStream(std::istream& is,
                         std::unordered_map<std::string, int32_t>* data) {
  // first read the map size
  size_t map_size = 0;
  if (!is.read(reinterpret_cast<char*>(&map_size), sizeof(map_size))) {
    return;
  }
  data->reserve(map_size);

  // then read the data
  for (size_t i = 0; i < map_size; ++i) {
    // read the token length
    size_t token_length = 0;
    if (!is.read(reinterpret_cast<char*>(&token_length),
                 sizeof(token_length))) {
      return;
    }
    // read the token bytes directly into the string's own buffer, so no
    // manually managed temporary array is needed
    std::string token(token_length, '\0');
    if (token_length > 0 &&
        !is.read(&token[0], static_cast<std::streamsize>(token_length))) {
      return;
    }
    // read the token_id
    int32_t token_id = 0;
    if (!is.read(reinterpret_cast<char*>(&token_id), sizeof(token_id))) {
      return;
    }

    data->emplace(token, token_id);
  }
}

} // namespace framework
} // namespace paddle
48 changes: 48 additions & 0 deletions paddle/fluid/framework/string_array.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <codecvt>
#include <iostream>
#include <locale>
#include <string>
#include <unordered_map>
#include <vector>

namespace paddle {
namespace framework {

// Aliases for the string-valued variable types: a single string, a list of
// strings, and a vocabulary that maps a wide-string token to its int32 id.
using String = std::string;
using Strings = std::vector<std::string>;
using Vocab = std::unordered_map<std::wstring, std::int32_t>;

// Convert the std::string type to the std::wstring type.
// Returns false when the conversion fails, true otherwise.
bool ConvertStrToWstr(const std::string& src, std::wstring* res);
// Convert the std::wstring type to the std::string type.
void ConvertWstrToStr(const std::wstring& src, std::string* res);
// Normalization Form Canonical Decomposition.
// The normalized form of `s` is written to `*ret`.
void NFD(const std::string& s, std::string* ret);

// Write the data which is type of
// std::unordered_map<std::string, int32_t> to ostream.
void StringMapToStream(std::ostream& os,
const std::unordered_map<std::string, int32_t>& data);

// Read the data which is type of
// std::unordered_map<std::string, int32_t> from istream.
void StringMapFromStream(std::istream& is,
std::unordered_map<std::string, int32_t>* data);
} // namespace framework
} // namespace paddle
Loading

0 comments on commit 3f2d6a3

Please sign in to comment.