Merge pull request #1209 from TeslaZhao/develop
Update Pipeline benchmark & trace logs & OCR Examples
bjjwwang authored May 11, 2021
2 parents 68a43e5 + 8b43a81 commit c42a5ad
Showing 5 changed files with 217 additions and 74 deletions.
96 changes: 80 additions & 16 deletions python/examples/pipeline/ocr/benchmark.py
@@ -1,3 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import base64
@@ -12,6 +26,8 @@
import numpy as np
from paddle_serving_client.utils import MultiThreadRunner
from paddle_serving_client.utils import benchmark_args, show_latency


def parse_benchmark(filein, fileout):
with open(filein, "r") as fin:
res = yaml.load(fin)
@@ -24,6 +40,7 @@ def parse_benchmark(filein, fileout):
with open(fileout, "w") as fout:
yaml.dump(res, fout, default_flow_style=False)


def gen_yml(device):
fin = open("config.yml", "r")
config = yaml.load(fin)
@@ -33,61 +50,109 @@ def gen_yml(device):
config["op"]["det"]["local_service_conf"]["device_type"] = 1
config["op"]["det"]["local_service_conf"]["devices"] = "2"
config["op"]["rec"]["local_service_conf"]["device_type"] = 1
config["op"]["rec"]["local_service_conf"]["devices"] = "2"
with open("config2.yml", "w") as fout:
config["op"]["rec"]["local_service_conf"]["devices"] = "2"
with open("config2.yml", "w") as fout:
yaml.dump(config, fout, default_flow_style=False)
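For orientation, here is a rough, assumed sketch of the config subtree the GPU branch above rewrites. Reading device_type = 1 as "use GPU inference" and devices = "2" as "run on GPU card 2" is an inference from how benchmark.sh drives this script with the gpu argument, not a documented contract.

import yaml

# Assumed shape of the subtree gen_yml overrides for the det and rec ops when
# device == "gpu"; both field meanings are inferred, as noted above.
config = {"op": {op: {"local_service_conf": {"device_type": 1, "devices": "2"}}
                 for op in ("det", "rec")}}
print(yaml.dump(config, default_flow_style=False))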


def cv2_to_base64(image):
return base64.b64encode(image).decode('utf8')
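cv2_to_base64 sends the raw, still-compressed image bytes as base64 text; the server side (web_service.py, later in this commit) reverses it with base64.b64decode, np.frombuffer and cv2.imdecode. A minimal round-trip sketch with hypothetical helper names encode_image/decode_image:

import base64

import cv2
import numpy as np


def encode_image(path):
    # Client side: read the compressed JPEG/PNG bytes and base64-encode them,
    # which is what cv2_to_base64 does with the bytes read in run_http/run_rpc.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf8")


def decode_image(b64_str):
    # Server side: undo the base64 step and decode the bytes back into a BGR array,
    # mirroring the preprocess code shown in web_service.py below.
    raw = base64.b64decode(b64_str.encode("utf8"))
    return cv2.imdecode(np.frombuffer(raw, np.uint8), cv2.IMREAD_COLOR)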


def run_http(idx, batch_size):
print("start thread ({})".format(idx))
url = "http://127.0.0.1:9999/ocr/prediction"
url = "http://127.0.0.1:9999/ocr/prediction"
start = time.time()

test_img_dir = "imgs/"
#test_img_dir = "rctw_test/images/"
latency_list = []
total_number = 0
for img_file in os.listdir(test_img_dir):
l_start = time.time()
with open(os.path.join(test_img_dir, img_file), 'rb') as file:
image_data1 = file.read()
image = cv2_to_base64(image_data1)
data = {"key": ["image"], "value": [image]}
for i in range(100):
r = requests.post(url=url, data=json.dumps(data))
#for i in range(100):
r = requests.post(url=url, data=json.dumps(data))
print(r.json())
end = time.time()
return [[end - start]]
l_end = time.time()
latency_list.append(l_end * 1000 - l_start * 1000)
total_number = total_number + 1
return [[end - start], latency_list, [total_number]]


def multithread_http(thread, batch_size):
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_http , thread, batch_size)
start = time.time()
result = multi_thread_runner.run(run_http, thread, batch_size)
end = time.time()
total_cost = end - start
avg_cost = 0
total_number = 0
for i in range(thread):
avg_cost += result[0][i]
total_number += result[2][i]
avg_cost = avg_cost / thread
print("Total cost: {}s".format(total_cost))
print("Each thread cost: {}s. ".format(avg_cost))
print("Total count: {}. ".format(total_number))
print("AVG QPS: {} samples/s".format(batch_size * total_number /
total_cost))
show_latency(result[1])
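show_latency (imported from paddle_serving_client.utils above) consumes the per-request latency lists collected in result[1]. As a rough illustration of the kind of summary such a list yields (this is not the library's actual implementation), a percentile report could look like this:

import numpy as np


def summarize_latency(latency_ms):
    # Illustrative summary of per-request latencies in milliseconds; the real
    # show_latency may report different fields.
    arr = np.array(latency_ms)
    print("mean: {:.2f} ms".format(arr.mean()))
    for p in (50, 80, 90, 99):
        print("p{}: {:.2f} ms".format(p, np.percentile(arr, p)))


summarize_latency([12.3, 15.1, 11.8, 40.2, 13.7])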


def run_rpc(thread, batch_size):
client = PipelineClient()
client.connect(['127.0.0.1:18090'])
start = time.time()
test_img_dir = "imgs/"
#test_img_dir = "rctw_test/images/"
latency_list = []
total_number = 0
for img_file in os.listdir(test_img_dir):
l_start = time.time()
with open(os.path.join(test_img_dir, img_file), 'rb') as file:
image_data = file.read()
image = cv2_to_base64(image_data)

for i in range(100):
ret = client.predict(feed_dict={"image": image}, fetch=["res"])
ret = client.predict(feed_dict={"image": image}, fetch=["res"])
print(ret)
l_end = time.time()
latency_list.append(l_end * 1000 - l_start * 1000)
total_number = total_number + 1
end = time.time()
return [[end - start]]
return [[end - start], latency_list, [total_number]]


def multithread_rpc(thread, batch_size):
multi_thread_runner = MultiThreadRunner()
result = multi_thread_runner.run(run_rpc , thread, batch_size)
start = time.time()
result = multi_thread_runner.run(run_rpc, thread, batch_size)
end = time.time()
total_cost = end - start
avg_cost = 0
total_number = 0
for i in range(thread):
avg_cost += result[0][i]
total_number += result[2][i]
avg_cost = avg_cost / thread
print("Total cost: {}s".format(total_cost))
print("Each thread cost: {}s. ".format(avg_cost))
print("Total count: {}. ".format(total_number))
print("AVG QPS: {} samples/s".format(batch_size * total_number /
total_cost))
show_latency(result[1])


if __name__ == "__main__":
if sys.argv[1] == "yaml":
mode = sys.argv[2] # brpc/ local predictor
thread = int(sys.argv[3])
device = sys.argv[4]
gen_yml(device)
elif sys.argv[1] == "run":
mode = sys.argv[2] # http/ rpc
thread = int(sys.argv[3])
batch_size = int(sys.argv[4])
if mode == "http":
@@ -98,4 +163,3 @@ def multithread_rpc(thread, batch_size):
filein = sys.argv[2]
fileout = sys.argv[3]
parse_benchmark(filein, fileout)
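Tying the three __main__ modes together, a hedged driver sketch: the concrete argument values mirror the calls made in benchmark.sh below, and subprocess is used purely for illustration.

import subprocess

# "yaml" mode: generate config2.yml for a GPU local_predictor run (argv: mode, thread, device).
subprocess.run(["python3", "benchmark.py", "yaml", "local_predictor", "1", "gpu"], check=True)
# "run" mode: drive the HTTP benchmark with 8 client threads and batch size 1
# (the pipeline web service must already be listening on port 9999).
subprocess.run(["python3", "benchmark.py", "run", "http", "8", "1"], check=True)
# "dump" mode: condense the raw profile log into a YAML summary via parse_benchmark.
subprocess.run(["python3", "benchmark.py", "dump", "benchmark.log", "benchmark.tmp"], check=True)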

103 changes: 66 additions & 37 deletions python/examples/pipeline/ocr/benchmark.sh
@@ -1,59 +1,88 @@
export FLAGS_profile_pipeline=1
alias python3="python3.7"
alias python3="python3.6"
modelname="ocr"

# HTTP
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
#ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
sleep 3
python3 benchmark.py yaml local_predictor 1 gpu
# Create the yaml config. If you already have the config file, skip this step.
#python3 benchmark.py yaml local_predictor 1 gpu
rm -rf profile_log_$modelname
for thread_num in 1 8 16

echo "Starting HTTP Clients..."
# Start a client in each thread to test the multi-threaded case.
for thread_num in 1 2 4 8 12 16
do
for batch_size in 1
do
echo "----Bert thread num: $thread_num batch size: $batch_size mode:http ----" >>profile_log_$modelname
rm -rf PipelineServingLogs
rm -rf cpu_utilization.py
python3 web_service.py >web.log 2>&1 &
sleep 3
nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
echo "----$modelname thread num: $thread_num batch size: $batch_size mode:http ----" >> profile_log_$modelname
# Start one web service. If you have already started the service yourself, skip this step.
#python3 web_service.py >web.log 2>&1 &
#sleep 3

# --id selects the GPU card; it must match the gpu id used by the server.
nvidia-smi --id=3 --query-gpu=memory.used --format=csv -lms 1000 > gpu_use.log 2>&1 &
nvidia-smi --id=3 --query-gpu=utilization.gpu --format=csv -lms 1000 > gpu_utilization.log 2>&1 &
echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
python3 benchmark.py run http $thread_num $batch_size
python3 cpu_utilization.py >>profile_log_$modelname
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
python3 benchmark.py dump benchmark.log benchmark.tmp
mv benchmark.tmp benchmark.log
# Start http client
python3 benchmark.py run http $thread_num $batch_size > profile 2>&1

# Collect CPU metrics, filter out momentary zero readings, and record the peak GPU memory and the average GPU utilization (a Python equivalent is sketched after this loop).
python3 cpu_utilization.py >> profile_log_$modelname
grep -av '^0 %' gpu_utilization.log > gpu_utilization.log.tmp
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
cat benchmark.log >> profile_log_$modelname
#rm -rf gpu_use.log gpu_utilization.log
awk -F' ' '{sum+=$1} END {print "GPU_UTILIZATION:", sum/NR, sum, NR }' gpu_utilization.log.tmp >> profile_log_$modelname

# Show profiles
python3 ../../util/show_profile.py profile $thread_num >> profile_log_$modelname
tail -n 8 profile >> profile_log_$modelname
echo '' >> profile_log_$modelname
done
done
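The grep/awk lines above distill gpu_use.log and gpu_utilization.log into a peak-memory figure and an average utilization. A rough Python equivalent, assuming nvidia-smi's CSV output of a one-line header followed by readings such as "1234 MiB" or "35 %":

def peak_gpu_memory(path="gpu_use.log"):
    # Maximum memory.used reading in MiB, skipping the CSV header line.
    values = []
    with open(path) as f:
        for line in list(f)[1:]:
            line = line.strip()
            if line:
                values.append(float(line.split()[0]))
    return max(values) if values else 0.0


def avg_gpu_utilization(path="gpu_utilization.log"):
    # Mean utilization.gpu percentage, ignoring momentary "0 %" readings,
    # which is the same filtering the grep -av '^0 %' step performs.
    values = []
    with open(path) as f:
        for line in list(f)[1:]:
            line = line.strip()
            if line and not line.startswith("0 %"):
                values.append(float(line.split()[0]))
    return sum(values) / len(values) if values else 0.0


print("MAX_GPU_MEMORY:", peak_gpu_memory())
print("GPU_UTILIZATION:", avg_gpu_utilization())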

# Kill all nvidia-smi background tasks.
pkill nvidia-smi

echo "Starting RPC Clients..."

# RPC
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
#ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
sleep 3
python3 benchmark.py yaml local_predictor 1 gpu

for thread_num in 1 8 16
# Create the yaml config. If you already have the config file, skip this step.
#python3 benchmark.py yaml local_predictor 1 gpu
rm -rf profile_log_$modelname

# Start a client in each thread to test the multi-threaded case.
for thread_num in 1 2 4 6 8 12 16
do
for batch_size in 1
do
echo "----Bert thread num: $thread_num batch size: $batch_size mode:rpc ----" >>profile_log_$modelname
rm -rf PipelineServingLogs
rm -rf cpu_utilization.py
python3 web_service.py >web.log 2>&1 &
sleep 3
nvidia-smi --id=2 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=2 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
echo "----$modelname thread num: $thread_num batch size: $batch_size mode:rpc ----" >> profile_log_$modelname
# Start one web service. If you have already started the service yourself, skip this step.
#python3 web_service.py >web.log 2>&1 &
#sleep 3

# --id selects the GPU card; it must match the gpu id used by the server.
nvidia-smi --id=3 --query-compute-apps=used_memory --format=csv -lms 100 > gpu_use.log 2>&1 &
nvidia-smi --id=3 --query-gpu=utilization.gpu --format=csv -lms 100 > gpu_utilization.log 2>&1 &
echo "import psutil\ncpu_utilization=psutil.cpu_percent(1,False)\nprint('CPU_UTILIZATION:', cpu_utilization)\n" > cpu_utilization.py
python3 benchmark.py run rpc $thread_num $batch_size
python3 cpu_utilization.py >>profile_log_$modelname
ps -ef | grep web_service | awk '{print $2}' | xargs kill -9
python3 benchmark.py dump benchmark.log benchmark.tmp
mv benchmark.tmp benchmark.log

# Start rpc client
python3 benchmark.py run rpc $thread_num $batch_size > profile 2>&1

# Collect CPU metrics, filter out momentary zero readings, and record the peak GPU memory and the average GPU utilization.
python3 cpu_utilization.py >> profile_log_$modelname
grep -av '^0 %' gpu_utilization.log > gpu_utilization.log.tmp
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "MAX_GPU_MEMORY:", max}' gpu_use.log >> profile_log_$modelname
awk 'BEGIN {max = 0} {if(NR>1){if ($modelname > max) max=$modelname}} END {print "GPU_UTILIZATION:", max}' gpu_utilization.log >> profile_log_$modelname
#rm -rf gpu_use.log gpu_utilization.log
cat benchmark.log >> profile_log_$modelname
awk -F" " '{sum+=$1} END {print "GPU_UTILIZATION:", sum/NR, sum, NR }' gpu_utilization.log.tmp >> profile_log_$modelname

# Show profiles
python3 ../../util/show_profile.py profile $thread_num >> profile_log_$modelname
tail -n 8 profile >> profile_log_$modelname
echo "" >> profile_log_$modelname
done
done

# Kill all nvidia-smi background tasks.
pkill nvidia-smi
9 changes: 6 additions & 3 deletions python/examples/pipeline/ocr/web_service.py
@@ -45,6 +45,7 @@ def preprocess(self, input_dicts, data_id, log_id):
imgs = []
for key in input_dict.keys():
data = base64.b64decode(input_dict[key].encode('utf8'))
self.raw_im = data
data = np.frombuffer(data, np.uint8)
self.im = cv2.imdecode(data, cv2.IMREAD_COLOR)
self.ori_h, self.ori_w, _ = self.im.shape
@@ -61,7 +62,7 @@ def postprocess(self, input_dicts, fetch_dict, log_id):
]
dt_boxes_list = self.post_func(det_out, [ratio_list])
dt_boxes = self.filter_func(dt_boxes_list[0], [self.ori_h, self.ori_w])
out_dict = {"dt_boxes": dt_boxes, "image": self.im}
out_dict = {"dt_boxes": dt_boxes, "image": self.raw_im}
return out_dict, None, ""


@@ -73,7 +74,9 @@ def init_op(self):

def preprocess(self, input_dicts, data_id, log_id):
(_, input_dict), = input_dicts.items()
im = input_dict["image"]
raw_im = input_dict["image"]
data = np.frombuffer(raw_im, np.uint8)
im = cv2.imdecode(data, cv2.IMREAD_COLOR)
dt_boxes = input_dict["dt_boxes"]
dt_boxes = self.sorted_boxes(dt_boxes)
feed_list = []
@@ -99,7 +102,7 @@ def preprocess(self, input_dicts, data_id, log_id):
"""

## Many mini-batches; feed_data is a list (see the sketch after this hunk).
max_batch_size = 6 # len(dt_boxes)
max_batch_size = len(dt_boxes)

# If max_batch_size is 0, skip the predict stage
if max_batch_size == 0:
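The mini-batch comment above implies that the recognizer's feed list is built in chunks of at most max_batch_size boxes; with the new max_batch_size = len(dt_boxes), everything lands in a single batch. A rough sketch with a hypothetical helper split_into_batches, not this file's actual implementation:

def split_into_batches(items, max_batch_size):
    # Group detected boxes into chunks of at most max_batch_size; each chunk
    # would become one feed dict for the rec model.
    return [items[i:i + max_batch_size]
            for i in range(0, len(items), max_batch_size)]


boxes = list(range(13))  # stand-in for dt_boxes
print([len(c) for c in split_into_batches(boxes, 6)])           # [6, 6, 1]
print([len(c) for c in split_into_batches(boxes, len(boxes))])  # [13]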
