parent 7eb6443

author Louis J <ljean@etud.insa-toulouse.fr> 1563984477 +0200 committer Guillaume Infantes <guillaume.infantes@jolibrain.com> 1576060297 +0100 parent 7eb6443 author Louis J <ljean@etud.insa-toulouse.fr> 1563984477 +0200 committer Guillaume Infantes <guillaume.infantes@jolibrain.com> 1576059845 +0100 LOUISJ'S COMMITS: Move dataset management and model building in separate classes Add train and test The fix on txtinputconnector is temporary, vocab generation should be fixed a more robust way BERT finetuning with custom number of classes Add self supervised Masked LM learning Save solver checkpoint along with model Ensure label is of correct dimension Fix masked_lm, add more explicit error message Add script to trace huggingface models Add classfication on hidden states to be able to use masked lm model for classif Better API, more features, less memory usage and fix bugs Add unit tests for training Move training parameters to solver and net Add comments Download tar from deepdetect.com torch 1.3.1 alone working with caffe patch correction: add pcaffe/logging.h force -j8 when building libtorch (default is -j nproc) points to model traced for torch 131 GUILLAUME COMMITS: changes for torch 131 Move dataset management and model building in separate classes Add train and test The fix on txtinputconnector is temporary, vocab generation should be fixed a more robust way BERT finetuning with custom number of classes Add self supervised Masked LM learning Save solver checkpoint along with model Ensure label is of correct dimension Better API, more features, less memory usage and fix bugs Move training parameters to solver and net Add comments Add inference support for GPT2 Make lower case optional Add gpt2 training Add gpt2 demo rebase all glitches in merge update to last transformers from hugginface gpt2 inference ok sanitize width vs sequence remove comment in cmakelist
jolibrain · Dec 12, 2019 · fb2e2d4 · fb2e2d4
1 parent 7eb6443
commit fb2e2d4
Show file tree

Hide file tree

Showing 17 changed files with 2,288 additions and 143 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -706,7 +706,8 @@ if (USE_TORCH)
   set(PYTORCH_PATCHES_PATH ${CMAKE_BINARY_DIR}/patches/pytorch)
 
   set(PYTORCH_PATCHES
-    ${PYTORCH_PATCHES_PATH}/pytorch_log.patch
+    ${PYTORCH_PATCHES_PATH}/pytorch_compile.patch
+    ${PYTORCH_PATCHES_PATH}/pytorch_logging.patch
     )
 
   include_directories("${PROTOBUF_INCLUDE_DIR}")
@@ -715,7 +716,8 @@ if (USE_TORCH)
   add_definitions(-DUSE_TORCH)
 
   if (NOT TORCH_LOCATION)
-    set(PYTORCH_COMMIT 0b868b19063645afed59d6d49aff1e43d1665b88)
+    # below version 1.3.1
+    set(PYTORCH_COMMIT ee77ccbb6da4e2efd83673e798acf7081bc03564)
     set(PYTORCH_COMPLETE ${CMAKE_BINARY_DIR}/CMakeFiles/pytorch-complete)
 
     if(NOT USE_CPU_ONLY AND CUDA_FOUND)
@@ -734,7 +736,7 @@ if (USE_TORCH)
       PATCH_COMMAND test -f ${PYTORCH_COMPLETE} && echo Skipping || git apply ${PYTORCH_PATCHES} && echo Applying ${PYTORCH_PATCHES}
       CONFIGURE_COMMAND ""
       BUILD_COMMAND ""
-      COMMAND test -f ${PYTORCH_COMPLETE} && echo Skipping || GLIBCXX_USE_CXX11_ABI=1 BUILD_TEST=0 USE_CUDA=${PYTORCH_USE_CUDA} python3 ../pytorch/tools/build_libtorch.py
+      COMMAND test -f ${PYTORCH_COMPLETE} && echo Skipping || GLIBCXX_USE_CXX11_ABI=1 BUILD_TEST=0 USE_CUDA=${PYTORCH_USE_CUDA} CAFFE2_LINK_LOCAL_PROTOBUF=0 MAX_JOBS=8 python3 ../pytorch/tools/build_libtorch.py
       INSTALL_COMMAND ""
     )
 
@@ -747,10 +749,10 @@ if (USE_TORCH)
   #  )
   #message(STATUS "Libraries are: ${TORCH_LIBRARIES}")
 
-  set(TORCH_LIB_DEPS torch caffe2 ${TORCH_LOCATION}/lib/libc10.so)
+  set(TORCH_LIB_DEPS torch ${TORCH_LOCATION}/lib/libc10.so)
 
   if (NOT USE_CPU_ONLY AND CUDA_FOUND)
-    list(APPEND TORCH_LIB_DEPS caffe2_gpu ${TORCH_LOCATION}/lib/libc10_cuda.so iomp5)
+    list(APPEND TORCH_LIB_DEPS  ${TORCH_LOCATION}/lib/libc10_cuda.so iomp5)
   else()
     list(APPEND TORCH_LIB_DEPS iomp5)
   endif()

diff --git a/demo/gpt2/dd_client.py b/demo/gpt2/dd_client.py
@@ -0,0 +1 @@
+../../clients/python/dd_client.py
diff --git a/demo/gpt2/run_gpt2.py b/demo/gpt2/run_gpt2.py
@@ -0,0 +1,75 @@
+import random
+import sys
+import argparse
+from dd_client import DD
+
+parser = argparse.ArgumentParser(description="Use DeepDetect and GPT-2 to generate text")
+parser.add_argument("-r", "--repository", required=True, help="Model repository")
+parser.add_argument("--host", type=str, default="localhost")
+parser.add_argument("--port", type=int, default=8080)
+parser.add_argument("--cpu", action='store_true', help="Force model to run on CPU")
+parser.add_argument("--input-size", type=int, default=512)
+parser.add_argument("--topk", type=int, default=5, help="How many top predictions should be considered to chose the next token.")
+parser.add_argument("--temperature", type=float, default=1, help="Temperature of the predictions. The higher, the 'randomer'.")
+
+args = parser.parse_args()
+
+# dd global variables
+sname = 'gpt-2'
+description = 'Inference with GPT-2'
+mllib = 'torch'
+
+dd = DD(args.host, args.port)
+dd.set_return_format(dd.RETURN_PYTHON)
+
+# setting up the ML service
+model = {'repository':args.repository}
+parameters_input = {
+    'connector':'txt',
+    'ordered_words': True,
+    'wordpiece_tokens': True,
+    'punctuation_tokens': True,
+    'lower_case': False,
+    'width': args.input_size
+}
+parameters_mllib = {'template':'gpt2', 'gpu':True}
+parameters_output = {}
+dd.put_service(sname,model,description,mllib,
+               parameters_input,parameters_mllib,parameters_output)
+
+# generating text
+prompt = input("Enter beggining of sentence >>> ")
+
+for i in range(0, 256):
+    data = [prompt]
+    parameters_input = {'word_start': "Ġ", 'suffix_start': ""}
+    parameters_mllib = {}
+    parameters_output = {'best':args.topk}
+    result = dd.post_predict(sname, data, parameters_input,parameters_mllib,parameters_output)
+
+    # Select result from the returned tokens
+    word_probs = list()
+    total_probs = 0
+
+    for cls in result['body']['predictions'][0]['classes']:
+        word = cls['cat'].replace("Ġ", " ")
+        # dede does not support \n character well, so we don't select tokens containing a new line
+        if 'Ċ' in word:
+            continue
+
+        prob = pow(cls['prob'], args.temperature)
+        total_probs += prob
+        word_probs.append((word, prob))
+
+    selector = random.uniform(0, total_probs)
+    total_probs = 0
+
+    for word, prob in word_probs:
+        total_probs += prob
+        if total_probs > selector:
+            selected_word = word
+            break
+
+    print(selected_word, sep='', end='')
+    sys.stdout.flush()
+    prompt += selected_word
diff --git a/patches/pytorch/pytorch_compile.patch b/patches/pytorch/pytorch_compile.patch
@@ -0,0 +1,12 @@
+diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py
+index 894559ed43..7887147a28 100644
+--- a/tools/setup_helpers/cmake.py
++++ b/tools/setup_helpers/cmake.py
+@@ -229,6 +229,7 @@ class CMake:
+              'CUDA_NVCC_EXECUTABLE',
+              'CUDNN_LIBRARY',
+              'CUDNN_INCLUDE_DIR',
++             'CAFFE2_LINK_LOCAL_PROTOBUF',
+              'EXPERIMENTAL_SINGLE_THREAD_POOL',
+              'INSTALL_TEST',
+              'MKL_THREADING',