-
Notifications
You must be signed in to change notification settings - Fork 5.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add FasterTokenizer Operator (#34491)
Add Tokenizer-related functionality for Transformer models so that tokenization is consistent between training and prediction. * support the text string as an input Tensor * support the "VOCAB" unordered_map<wstring, int> as an input Tensor to look up tokens * Tokenizer used for BERT. This tokenizer applies end-to-end tokenization, from text string to wordpiece. * It first applies basic tokenization, followed by wordpiece tokenization.
- Loading branch information
1 parent
873ee4e
commit 3f2d6a3
Showing
53 changed files
with
3,604 additions
and
157 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
include(ExternalProject)

# Locations used for fetching/building utf8proc and for its installed output.
set(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc)
set(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc)

# Source repository and pinned tag.
# NOTE(review): the previous comment described this as a "non-official repo"
# with extra features, but the URL points at JuliaStrings/utf8proc -- confirm
# which repository is actually intended.
set(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git)
set(UTF8PROC_TAG v2.6.1)

# Path of the static library the external build is expected to install.
# On Windows the UTF8PROC_STATIC definition is also added for this directory.
if(WIN32)
  set(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib")
  add_definitions(-DUTF8PROC_STATIC)
else()
  set(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a")
endif()

# Make <utf8proc.h> visible from the install tree.
include_directories(${UTF8PROC_INSTALL_DIR}/include)

# Fetch, configure, build and install utf8proc as an external project.
ExternalProject_Add(
  extern_utf8proc
  ${EXTERNAL_PROJECT_LOG_ARGS}
  ${SHALLOW_CLONE}
  GIT_REPOSITORY    ${UTF8PROC_REPOSITORY}
  GIT_TAG           ${UTF8PROC_TAG}
  PREFIX            ${UTF8PROC_PREFIX_DIR}
  UPDATE_COMMAND    ""
  CMAKE_ARGS        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                    -DBUILD_SHARED=ON
                    -DBUILD_STATIC=ON
                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                    -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR}
                    -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
  BUILD_BYPRODUCTS  ${UTF8PROC_LIBRARIES}
)

# Expose the installed archive as an imported target that depends on the
# external build step.
add_library(utf8proc STATIC IMPORTED GLOBAL)
set_target_properties(utf8proc PROPERTIES IMPORTED_LOCATION ${UTF8PROC_LIBRARIES})
add_dependencies(utf8proc extern_utf8proc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. */ | ||
|
||
#include <utf8proc.h> | ||
|
||
#include <exception> | ||
|
||
#include "glog/logging.h" | ||
#include "paddle/fluid/framework/string_array.h" | ||
|
||
namespace paddle { | ||
namespace framework { | ||
|
||
std::wstring_convert<std::codecvt_utf8<wchar_t>> kConverter; | ||
|
||
// Convert the std::string type to the std::wstring type. | ||
bool ConvertStrToWstr(const std::string& src, std::wstring* res) { | ||
try { | ||
*res = kConverter.from_bytes(src); | ||
} catch (std::range_error& e) { | ||
VLOG(3) << "The string " << src << " was converted to unicode failedly! "; | ||
return false; | ||
} | ||
return true; | ||
} | ||
|
||
// Convert the std::wstring type to the std::string type.
//
// The wide string is encoded through the shared kConverter and the resulting
// bytes are stored in *res. No exception is caught here, so anything thrown
// by the converter propagates to the caller.
void ConvertWstrToStr(const std::wstring& src, std::string* res) {
  res->assign(kConverter.to_bytes(src));
}
|
||
// Normalization Form Canonical Decomposition. | ||
void NFD(const std::string& s, std::string* ret) { | ||
*ret = ""; | ||
char* result = reinterpret_cast<char*>( | ||
utf8proc_NFD(reinterpret_cast<const unsigned char*>(s.c_str()))); | ||
if (result) { | ||
*ret = std::move(std::string(result)); | ||
free(result); | ||
} | ||
} | ||
|
||
// Write the data which is type of
// std::unordered_map<std::string, int32_t> to ostream.
//
// Byte layout (raw in-memory representation of each value, via os.write):
//   size_t   number of entries
//   then, for every entry in the map's iteration order:
//     size_t   token length in bytes
//     char[]   token bytes (no terminator)
//     int32_t  token id
void StringMapToStream(std::ostream& os,
                       const std::unordered_map<std::string, int32_t>& data) {
  // Firstly write the number of entries.
  const size_t entry_count = data.size();
  os.write(reinterpret_cast<const char*>(&entry_count), sizeof(entry_count));

  // Then write every (token, token_id) pair. Entries are bound by const
  // reference so the token string is not copied on each iteration.
  for (const auto& entry : data) {
    const std::string& token = entry.first;
    const int32_t token_id = entry.second;

    // Token: length prefix followed by the raw bytes.
    const size_t length = token.size();
    os.write(reinterpret_cast<const char*>(&length), sizeof(length));
    os.write(token.data(), static_cast<std::streamsize>(length));

    // Token id.
    os.write(reinterpret_cast<const char*>(&token_id), sizeof(token_id));
  }
}
|
||
// Read the data which is type of
// std::unordered_map<std::string, int32_t> from istream.
//
// Expected byte layout (the one produced by StringMapToStream):
//   size_t   number of entries
//   then, per entry: size_t token length, token bytes, int32_t token id
//
// is:   stream to read from.
// data: map that receives the entries; must not be null. Entries are added
//       with emplace, so a key already present in the map is left unchanged.
//
// If any read fails (empty or truncated stream), reading stops immediately:
// entries that were read completely stay in *data, the incomplete one is
// discarded, and the stream is left in its failed state so the caller can
// detect the problem by testing `is`.
void StringMapFromStream(std::istream& is,
                         std::unordered_map<std::string, int32_t>* data) {
  // First read the map size. Zero-initialized so a failed read can never
  // leave an indeterminate value behind.
  size_t map_size = 0;
  is.read(reinterpret_cast<char*>(&map_size), sizeof(map_size));
  if (!is) {
    return;
  }
  data->reserve(map_size);

  // Then read the entries.
  for (size_t i = 0; i < map_size; ++i) {
    // Read the token length.
    size_t token_length = 0;
    is.read(reinterpret_cast<char*>(&token_length), sizeof(token_length));
    if (!is) {
      return;
    }

    // Read the token bytes directly into the string's own storage; this
    // replaces the manual new[]/delete[] scratch buffer.
    std::string token(token_length, '\0');
    is.read(&token[0], static_cast<std::streamsize>(token_length));
    if (!is) {
      return;
    }

    // Read the token_id.
    int32_t token_id = 0;
    is.read(reinterpret_cast<char*>(&token_id), sizeof(token_id));
    if (!is) {
      return;
    }

    data->emplace(std::move(token), token_id);
  }
}
|
||
} // namespace framework | ||
} // namespace paddle |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. */ | ||
|
||
#pragma once | ||
|
||
#include <codecvt> | ||
#include <iostream> | ||
#include <locale> | ||
#include <string> | ||
#include <unordered_map> | ||
#include <vector> | ||
|
||
namespace paddle {
namespace framework {

// Aliases for the string-related types declared by this header.
using String = std::string;
using Strings = std::vector<std::string>;
// Maps a wide-character token to its integer id.
using Vocab = std::unordered_map<std::wstring, std::int32_t>;

// Convert the std::string type to the std::wstring type.
// Returns true on success and false when the conversion fails.
bool ConvertStrToWstr(const std::string& src, std::wstring* res);
// Convert the std::wstring type to the std::string type.
void ConvertWstrToStr(const std::wstring& src, std::string* res);
// Normalization Form Canonical Decomposition.
// Stores the normalized form of `s` in *ret.
void NFD(const std::string& s, std::string* ret);

// Write the data which is type of
// std::unordered_map<std::string, int32_t> to ostream.
void StringMapToStream(std::ostream& os,
                       const std::unordered_map<std::string, int32_t>& data);

// Read the data which is type of
// std::unordered_map<std::string, int32_t> from istream.
void StringMapFromStream(std::istream& is,
                         std::unordered_map<std::string, int32_t>* data);
}  // namespace framework
}  // namespace paddle
Oops, something went wrong.