AOTInductor BERT CPP example #2931

Merged Feb 27, 2024 · 48 commits

Changes from 34 commits

Commits
ab41f43
fix compile error on mac x86
lxning Feb 5, 2024
f833d48
Merge branch 'master' into cpp_mac_x86_compile
lxning Feb 5, 2024
9f85b38
update install libtorch
lxning Feb 5, 2024
b1fadca
fmt
lxning Feb 7, 2024
b430116
merge master
lxning Feb 7, 2024
0addfff
fmt
lxning Feb 8, 2024
e64ef7d
fmt
lxning Feb 8, 2024
ce0e65a
Set return type of bert model and dynamic shapes
mreso Feb 9, 2024
ceacda0
Merge branch 'master' into feat/bertcpp
lxning Feb 9, 2024
e827f2f
fix json value
lxning Feb 9, 2024
d0315a6
fix build on linux
lxning Feb 9, 2024
2692dd2
add linux dependency
lxning Feb 10, 2024
92a6238
replace sentenepice with tokenizers-cpp
lxning Feb 10, 2024
94d4309
update dependency
lxning Feb 11, 2024
fd5e145
add attention mask
lxning Feb 12, 2024
203f19e
fix compile error
lxning Feb 12, 2024
0d8f505
fix compile error
lxning Feb 12, 2024
45ae6b2
fmt
lxning Feb 12, 2024
558df11
Fmt
lxning Feb 12, 2024
9c2cdf3
tockenizer-cpp git submodule
lxning Feb 12, 2024
a7a551f
update handler
lxning Feb 12, 2024
748b734
fmt
lxning Feb 13, 2024
0bbfc18
fmt
lxning Feb 14, 2024
472df28
Merge branch 'master' into feat/bertcpp
lxning Feb 14, 2024
4b2a1ce
fmt
lxning Feb 14, 2024
0604533
unset env
lxning Feb 14, 2024
9922a99
fix path
lxning Feb 15, 2024
0e81e4f
Fix type error in bert aot example
mreso Feb 15, 2024
ac08078
fmt
lxning Feb 17, 2024
869b9a3
fmt
lxning Feb 18, 2024
294175e
Merge branch 'master' into feat/bertcpp
lxning Feb 18, 2024
d6fa808
update max setting
lxning Feb 18, 2024
c33f66c
fix lint
lxning Feb 18, 2024
caa5042
add limitation
lxning Feb 18, 2024
d39ba51
pinned folly to v2024.02.19.00
lxning Feb 23, 2024
0e1d773
pinned yam-cpp with tags/0.8.0
lxning Feb 23, 2024
f8c71d4
pinned yaml-cpp 0.8.0
lxning Feb 25, 2024
be81439
update build.sh
lxning Feb 25, 2024
71deb70
pinned yaml-cpp v0.8.0
lxning Feb 25, 2024
ebbf119
fmt
lxning Feb 25, 2024
df861f1
Merge branch 'master' into feat/bertcpp
lxning Feb 25, 2024
a0c710b
fix typo
lxning Feb 25, 2024
20d8799
add submodule kineto
lxning Feb 26, 2024
6accaf4
fmt
lxning Feb 26, 2024
ee74ad9
fix workflow
lxning Feb 26, 2024
9b67364
fix workflow
lxning Feb 26, 2024
9c1a33a
fix ubuntu version
lxning Feb 26, 2024
9ab5336
update readme
lxning Feb 27, 2024
4 changes: 2 additions & 2 deletions cpp/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
project(torchserve_cpp VERSION 0.1)

set(CMAKE_CXX_STANDARD 17)
@@ -30,7 +30,7 @@ find_package(folly REQUIRED)
find_package(fmt REQUIRED)
find_package(gflags REQUIRED)
find_package(Torch REQUIRED)
+find_package(yaml-cpp REQUIRED NO_CMAKE_PATH)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

include_directories(${TORCH_INCLUDE_DIRS})
1 change: 1 addition & 0 deletions cpp/README.md
@@ -2,6 +2,7 @@
## Requirements
* C++17
* GCC version: gcc-9
+* cmake version: 3.26.4+
## Installation and Running TorchServe CPP

### Install dependencies
62 changes: 29 additions & 33 deletions cpp/build.sh
@@ -155,32 +155,14 @@ function install_yaml_cpp() {
cd "$BWD" || exit
}

-function install_sentencepiece() {
-  SENTENCEPIECE_SRC_DIR=$BASE_DIR/third-party/sentencepiece
-  SENTENCEPIECE_BUILD_DIR=$DEPS_DIR/sentencepiece-build
-
-  if [ ! -d "$SENTENCEPIECE_SRC_DIR" ] ; then
-    echo -e "${COLOR_GREEN}[ INFO ] Cloning sentencepiece repo ${COLOR_OFF}"
-    git clone https://github.com/google/sentencepiece.git "$SENTENCEPIECE_SRC_DIR"
-    cd $SENTENCEPIECE_SRC_DIR
-    git checkout tags/v0.1.99
-  fi
-
-  if [ ! -d "$SENTENCEPIECE_BUILD_DIR" ] ; then
-    echo -e "${COLOR_GREEN}[ INFO ] Building sentencepiece ${COLOR_OFF}"
-
-    mkdir $SENTENCEPIECE_BUILD_DIR
-    cd $SENTENCEPIECE_BUILD_DIR
-    cmake $SENTENCEPIECE_SRC_DIR
-    make -i $(nproc)
-    if [ "$PLATFORM" = "Linux" ]; then
-      sudo make install
-      sudo ldconfig -v
-    elif [ "$PLATFORM" = "Mac" ]; then
-      make install
-    fi
-
-    echo -e "${COLOR_GREEN}[ INFO ] sentencepiece is installed ${COLOR_OFF}"
+function install_tokenizer_cpp() {
+  TOKENIZERS_CPP_SRC_DIR=$BASE_DIR/third-party/tokenizers-cpp
+
+  if [ ! -d "$TOKENIZERS_CPP_SRC_DIR" ] ; then
+    echo -e "${COLOR_GREEN}[ INFO ] Cloning tokenizers-cpp repo ${COLOR_OFF}"
+    git clone https://github.com/mlc-ai/tokenizers-cpp.git "$TOKENIZERS_CPP_SRC_DIR"
+    cd $TOKENIZERS_CPP_SRC_DIR
+    git submodule update --init --recursive
Reviewer comment (Collaborator):

Better to create a submodule in third-party directly instead of cloning it manually. That way we freeze a specific commit and nothing breaks if the tokenizer-cpp repo gets updated. See llama2.so for example. git submodule update --init --recursive is executed in build.sh for all our submodules.
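A minimal sketch of that suggestion (the commit hash placeholder is hypothetical; run from the repository root):

```bash
# Pin tokenizers-cpp as a git submodule instead of cloning it in build.sh
git submodule add https://github.com/mlc-ai/tokenizers-cpp.git cpp/third-party/tokenizers-cpp
git -C cpp/third-party/tokenizers-cpp checkout <commit-to-pin>
git add .gitmodules cpp/third-party/tokenizers-cpp
```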

  fi

  cd "$BWD" || exit
@@ -208,14 +190,28 @@ function prepare_test_files() {
  if [ ! -f "${EX_DIR}/babyllama/babyllama_handler/stories15M.bin" ]; then
    wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin -O "${EX_DIR}/babyllama/babyllama_handler/stories15M.bin"
  fi
-  if [ ! -f "${EX_DIR}/aot_inductor/llama_handler/stories15M.so" ]; then
-    local HANDLER_DIR=${EX_DIR}/aot_inductor/llama_handler/
-    if [ ! -f "${HANDLER_DIR}/stories15M.pt" ]; then
-      wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt?download=true -O "${HANDLER_DIR}/stories15M.pt"
+  # PT2.2 torch.export does not support Mac
+  if [ "$PLATFORM" = "Linux" ]; then
+    if [ ! -f "${EX_DIR}/aot_inductor/llama_handler/stories15M.so" ]; then
+      local HANDLER_DIR=${EX_DIR}/aot_inductor/llama_handler/
+      if [ ! -f "${HANDLER_DIR}/stories15M.pt" ]; then
+        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt?download=true -O "${HANDLER_DIR}/stories15M.pt"
+      fi
+      local LLAMA_SO_DIR=${BASE_DIR}/third-party/llama2.so/
+      PYTHONPATH=${LLAMA_SO_DIR}:${PYTHONPATH} python ${BASE_DIR}/../examples/cpp/aot_inductor/llama2/compile.py --checkpoint ${HANDLER_DIR}/stories15M.pt ${HANDLER_DIR}/stories15M.so
+    fi
+    if [ ! -f "${EX_DIR}/aot_inductor/bert_handler/bert-seq.so" ]; then
+      pip install transformers
Reviewer comment (Collaborator):

This will better fit into the cpp section of ts_scripts/install_dependencies.py

+      local HANDLER_DIR=${EX_DIR}/aot_inductor/bert_handler/
+      export TOKENIZERS_PARALLELISM=false
+      cd ${BASE_DIR}/../examples/cpp/aot_inductor/bert/
+      python aot_compile_export.py
+      mv bert-seq.so ${HANDLER_DIR}/bert-seq.so
+      mv Transformer_model/tokenizer.json ${HANDLER_DIR}/tokenizer.json
+      export TOKENIZERS_PARALLELISM=""
+    fi
-    local LLAMA_SO_DIR=${BASE_DIR}/third-party/llama2.so/
-    PYTHONPATH=${LLAMA_SO_DIR}:${PYTHONPATH} python ${BASE_DIR}/../examples/cpp/aot_inductor/llama2/compile.py --checkpoint ${HANDLER_DIR}/stories15M.pt ${HANDLER_DIR}/stories15M.so
fi
cd "$BWD" || exit
}

function build() {
@@ -401,7 +397,7 @@ install_folly
install_kineto
install_libtorch
install_yaml_cpp
-install_sentencepiece
+install_tokenizer_cpp
build_llama_cpp
prepare_test_files
build
9 changes: 7 additions & 2 deletions cpp/src/examples/CMakeLists.txt
@@ -1,8 +1,13 @@

add_subdirectory("../../../examples/cpp/babyllama/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/babyllama/babyllama_handler/")

-add_subdirectory("../../../examples/cpp/aot_inductor/llama2/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/aot_inductor/llama_handler/")

add_subdirectory("../../../examples/cpp/llamacpp/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/llamacpp/llamacpp_handler/")

add_subdirectory("../../../examples/cpp/mnist/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/mnist/mnist_handler/")

+# PT2.2 torch.export does not support Mac
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  add_subdirectory("../../../examples/cpp/aot_inductor/llama2/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/aot_inductor/llama_handler/")
+
+  add_subdirectory("../../../examples/cpp/aot_inductor/bert" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/aot_inductor/bert_handler/")
+endif()
7 changes: 6 additions & 1 deletion cpp/src/utils/CMakeLists.txt
@@ -12,7 +12,12 @@ list(APPEND TS_UTILS_SOURCE_FILES ${TS_UTILS_SRC_DIR}/metrics/registry.cc)
add_library(ts_utils SHARED ${TS_UTILS_SOURCE_FILES})
target_include_directories(ts_utils PUBLIC ${TS_UTILS_SRC_DIR})
target_include_directories(ts_utils PRIVATE ${Boost_INCLUDE_DIRS})
-target_link_libraries(ts_utils ${FOLLY_LIBRARIES} ${CMAKE_DL_LIBS} ${Boost_LIBRARIES} yaml-cpp::yaml-cpp)
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+  target_link_libraries(ts_utils ${FOLLY_LIBRARIES} ${CMAKE_DL_LIBS} ${Boost_LIBRARIES} yaml-cpp::yaml-cpp)
+else()
+  target_link_libraries(ts_utils ${FOLLY_LIBRARIES} ${CMAKE_DL_LIBS} ${Boost_LIBRARIES} yaml-cpp)
+endif()

install(TARGETS ts_utils DESTINATION ${torchserve_cpp_SOURCE_DIR}/_build/libs)

list(APPEND FOO_SOURCE_FILES ${TS_UTILS_SRC_DIR}/ifoo.hh)
23 changes: 23 additions & 0 deletions cpp/test/examples/examples_test.cc
@@ -59,3 +59,26 @@ TEST_F(ModelPredictTest, TestLoadPredictLlamaCppHandler) {
          base_dir + "llamacpp_handler", "llamacpp", -1, "", "", 1, false),
      base_dir + "llamacpp_handler", base_dir + "prompt.txt", "llm_ts", 200);
}

TEST_F(ModelPredictTest, TestLoadPredictAotInductorBertHandler) {
  std::string base_dir = "_build/test/resources/examples/aot_inductor/";
  std::string file1 = base_dir + "bert_handler/bert-seq.so";
  std::string file2 = base_dir + "bert_handler/tokenizer.json";

  std::ifstream f1(file1);
  std::ifstream f2(file2);

  if (!f1.good() || !f2.good())
    GTEST_SKIP() << "Skipping TestLoadPredictAotInductorBertHandler because "
                    "of missing files: "
                 << file1 << " or " << file2;

  this->LoadPredict(
      std::make_shared<torchserve::LoadModelRequest>(
          base_dir + "bert_handler", "bert_aot",
          torch::cuda::is_available() ? 0 : -1, "", "", 1, false),
      base_dir + "bert_handler",
      base_dir + "bert_handler/sample_text.txt",
      "bert_ts",
      200);
}
11 changes: 11 additions & 0 deletions cpp/test/resources/examples/aot_inductor/bert_handler/MAR-INF/MANIFEST.json
@@ -0,0 +1,11 @@
{
  "createdOn": "12/02/2024 21:09:26",
  "runtime": "LSP",
  "model": {
    "modelName": "bertcppaot",
    "handler": "libbert_handler:BertCppHandler",
    "modelVersion": "1.0",
    "configFile": "model-config.yaml"
  },
  "archiverVersion": "0.9.0"
}
4 changes: 4 additions & 0 deletions cpp/test/resources/examples/aot_inductor/bert_handler/index_to_name.json
@@ -0,0 +1,4 @@
{
  "0": "Not Accepted",
  "1": "Accepted"
}
13 changes: 13 additions & 0 deletions cpp/test/resources/examples/aot_inductor/bert_handler/model-config.yaml
@@ -0,0 +1,13 @@
minWorkers: 1
maxWorkers: 1
batchSize: 2

handler:
  model_so_path: "bert-seq.so"
  tokenizer_path: "tokenizer.json"
  mapping: "index_to_name.json"
  model_name: "bert-base-uncased"
  mode: "sequence_classification"
  do_lower_case: true
  num_labels: 2
  max_length: 150
1 change: 1 addition & 0 deletions cpp/test/resources/examples/aot_inductor/bert_handler/sample_text.txt
@@ -0,0 +1 @@
Bloomberg has decided to publish a new report on the global economy.
5 changes: 5 additions & 0 deletions examples/cpp/aot_inductor/bert/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TOKENZIER_CPP_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../../../cpp/third-party/tokenizers-cpp)
add_subdirectory(${TOKENZIER_CPP_PATH} tokenizers EXCLUDE_FROM_ALL)
add_library(bert_handler SHARED src/bert_handler.cc)
target_include_directories(bert_handler PRIVATE ${TOKENZIER_CPP_PATH}/include)
target_link_libraries(bert_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES} tokenizers_cpp)
60 changes: 60 additions & 0 deletions examples/cpp/aot_inductor/bert/README.md
@@ -0,0 +1,60 @@
This example uses AOTInductor to compile [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) into a shared object (.so) file, which is then executed using libtorch.
The C++ source code of the handler for this example can be found [here](src).
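For orientation, here is a minimal sketch of how such an AOTInductor-compiled .so can be loaded and run from libtorch. The actual handler logic lives in [src](src); the runner class and header path below follow the PyTorch 2.2 AOTI API and are an assumption that may differ across versions:

```cpp
#include <iostream>
#include <vector>
#include <torch/torch.h>
// PyTorch 2.2 header location; later releases move this under torch/csrc/inductor/aoti_runner/
#include <torch/csrc/inductor/aoti_model_container_runner.h>

int main() {
  // Load the AOTInductor-compiled BERT model from its shared object
  torch::inductor::AOTIModelContainerRunnerCpu runner("bert-seq.so");

  // Token ids and attention mask; shapes must stay inside the dynamic ranges
  // baked in at export time (batch <= 15, seq_len <= 511)
  auto input_ids = torch::ones({2, 128}, torch::kLong);
  auto attention_mask = torch::ones({2, 128}, torch::kLong);
  std::vector<torch::Tensor> inputs{input_ids, attention_mask};

  // For sequence classification the first output holds logits of shape [batch, num_labels]
  std::vector<torch::Tensor> outputs = runner.run(inputs);
  std::cout << outputs[0] << std::endl;
  return 0;
}
```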

### Setup
1. Follow the instructions in [README.md](../../../../cpp/README.md) to build the TorchServe C++ backend.

```
cd serve/cpp
./build.sh
```

The build script will create the necessary artifacts for this example.
To recreate them by hand, you can follow the prepare_test_files function of the [build.sh](../../../../cpp/build.sh) script; a condensed sketch of those steps follows below.
We will need the handler .so file as well as bert-seq.so and tokenizer.json.
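A condensed sketch of the manual steps, mirroring prepare_test_files in [build.sh](../../../../cpp/build.sh) (paths assume you run from the repository root and built into `cpp/_build`):

```bash
# Export bert-base-uncased with AOTInductor and stage the artifacts for the handler
pip install transformers
cd examples/cpp/aot_inductor/bert/
export TOKENIZERS_PARALLELISM=false
python aot_compile_export.py   # writes bert-seq.so and Transformer_model/tokenizer.json
mv bert-seq.so ../../../../cpp/_build/test/resources/examples/aot_inductor/bert_handler/bert-seq.so
mv Transformer_model/tokenizer.json ../../../../cpp/_build/test/resources/examples/aot_inductor/bert_handler/tokenizer.json
unset TOKENIZERS_PARALLELISM
```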

2. Create a [model-config.yaml](model-config.yaml)

```yaml
minWorkers: 1
maxWorkers: 1
batchSize: 2

handler:
model_so_path: "bert-seq.so"
tokenizer_path: "tokenizer.json"
mapping: "index_to_name.json"
model_name: "bert-base-uncased"
mode: "sequence_classification"
do_lower_case: true
num_labels: 2
max_length: 150
```

### Generate Model Artifact Folder

```bash
torch-model-archiver --model-name bertcppaot --version 1.0 --handler ../../../../cpp/_build/test/resources/examples/aot_inductor/bert_handler/libbert_handler:BertCppHandler --runtime LSP --extra-files index_to_name.json,../../../../cpp/_build/test/resources/examples/aot_inductor/bert_handler/bert-seq.so,../../../../cpp/_build/test/resources/examples/aot_inductor/bert_handler/tokenizer.json --config-file model-config.yaml --archive-format no-archive
```

Create a model store directory and move the `bertcppaot` folder into it:

```
mkdir model_store
mv bertcppaot model_store/
```

### Inference

Start TorchServe using the following command:

```
torchserve --ncs --model-store model_store/ --models bertcppaot
```

Run inference using the following command; the expected response is shown below it:

```
curl http://localhost:8080/predictions/bertcppaot -T ../../../../cpp/test/resources/examples/aot_inductor/bert_handler/sample_text.txt
Not Accepted
```
121 changes: 121 additions & 0 deletions examples/cpp/aot_inductor/bert/aot_compile_export.py
@@ -0,0 +1,121 @@
import os
import sys

import torch
import yaml
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    set_seed,
)

set_seed(1)
# PT2.2 has a limitation on the max value of dynamic shape dimensions
Reviewer comment (Collaborator):

Can you link the issue or PR here that describes the problem in 2.2?

MAX_BATCH_SIZE = 15
MAX_LENGTH = 511


def transformers_model_dowloader(
    mode,
    pretrained_model_name,
    num_labels,
    do_lower_case,
    max_length,
    batch_size,
):
print("Download model and tokenizer", pretrained_model_name)
# loading pre-trained model and tokenizer
if mode == "sequence_classification":
config = AutoConfig.from_pretrained(
pretrained_model_name,
num_labels=num_labels,
torchscript=False,
return_dict=False,
)
model = AutoModelForSequenceClassification.from_pretrained(
pretrained_model_name, config=config
)
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name, do_lower_case=do_lower_case
)
else:
sys.exit(f"mode={mode} has not been implemented in this cpp example yet.")

NEW_DIR = "./Transformer_model"
try:
os.mkdir(NEW_DIR)
except OSError:
print("Creation of directory %s failed" % NEW_DIR)
else:
print("Successfully created directory %s " % NEW_DIR)

print(
"Save model and tokenizer model based on the setting from setup_config",
pretrained_model_name,
"in directory",
NEW_DIR,
)

model.save_pretrained(NEW_DIR)
Reviewer comment (Collaborator):

Why are we saving the model here? Its already in the hub cache so no need to save it again.

    tokenizer.save_pretrained(NEW_DIR)

    with torch.no_grad():
        model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        model = model.to(device=device)
        dummy_input = "This is a dummy input for torch jit trace"
Reviewer comment (Collaborator):

To not be misleading we should change this into "... for torch export"

        inputs = tokenizer.encode_plus(
            dummy_input,
            max_length=max_length,
            padding=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = torch.cat([inputs["input_ids"]] * batch_size, 0).to(device)
        attention_mask = torch.cat([inputs["attention_mask"]] * batch_size, 0).to(
            device
        )
        batch_dim = torch.export.Dim("batch", min=1, max=MAX_BATCH_SIZE)
        seq_len_dim = torch.export.Dim("seq_len", min=1, max=MAX_LENGTH)
        torch._C._GLIBCXX_USE_CXX11_ABI = True
        model_so_path = torch._export.aot_compile(
            model,
            (input_ids, attention_mask),
            dynamic_shapes={
                "input_ids": (batch_dim, seq_len_dim),
                "attention_mask": (batch_dim, seq_len_dim),
            },
            options={
                "aot_inductor.output_path": os.path.join(os.getcwd(), "bert-seq.so"),
                "max_autotune": True,
            },
        )

    return


if __name__ == "__main__":
    dirname = os.path.dirname(__file__)
    if len(sys.argv) > 1:
        filename = os.path.join(dirname, sys.argv[1])
    else:
        filename = os.path.join(dirname, "model-config.yaml")
    with open(filename, "r") as f:
        settings = yaml.safe_load(f)

    mode = settings["handler"]["mode"]
    model_name = settings["handler"]["model_name"]
    num_labels = int(settings["handler"]["num_labels"])
    do_lower_case = bool(settings["handler"]["do_lower_case"])
    max_length = int(settings["handler"]["max_length"])
    batch_size = int(settings["batchSize"])
    transformers_model_dowloader(
        mode,
        model_name,
        num_labels,
        do_lower_case,
        max_length,
        batch_size,
    )
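Usage follows from the `__main__` block above: run the script from examples/cpp/aot_inductor/bert/, optionally passing an alternative config file (the name below is hypothetical; the path is resolved relative to the script):

```bash
python aot_compile_export.py                 # reads model-config.yaml next to the script
python aot_compile_export.py my-config.yaml  # or use a different config
```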
4 changes: 4 additions & 0 deletions examples/cpp/aot_inductor/bert/index_to_name.json
@@ -0,0 +1,4 @@
{
  "0": "Not Accepted",
  "1": "Accepted"
}