merge int8 #99

Merged: 16 commits, Nov 28, 2023
Changes from all commits
5 changes: 3 additions & 2 deletions cmake/cuda.cmake
@@ -28,7 +28,8 @@ elseif(NEW_RELEASE_JIT)
else()
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
#set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
set(paddle_known_gpu_archs11 "70 80")
endif()

######################################################################################
@@ -161,7 +162,7 @@ function(select_nvcc_arch_flags out_variable)
if(WITH_NV_JETSON)
set(cuda_arch_bin "72")
else()
set(cuda_arch_bin "70")
set(cuda_arch_bin "70 80")
endif()
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
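Note on the arch change above: with paddle_known_gpu_archs11 narrowed to "70 80" and the default cuda_arch_bin widened to "70 80", the build ships device code only for Volta (sm_70) and Ampere (sm_80), so the resulting binary cannot launch kernels on earlier GPUs. A minimal sketch of a runtime guard mirroring that build-time constraint, using only the plain CUDA runtime API (the guard itself is illustrative and not part of this PR):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
    std::fprintf(stderr, "no usable CUDA device\n");
    return 1;
  }
  // This build only contains sm_70/sm_80 code, so require compute capability >= 7.0.
  if (prop.major < 7) {
    std::fprintf(stderr, "compute capability %d.%d < 7.0 is not supported by this build\n",
                 prop.major, prop.minor);
    return 1;
  }
  std::printf("GPU ok: sm_%d%d\n", prop.major, prop.minor);
  return 0;
}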
7 changes: 7 additions & 0 deletions paddle/fluid/framework/CMakeLists.txt
@@ -41,6 +41,13 @@ endif()
proto_library(framework_proto SRCS framework.proto)
proto_library(pass_desc_proto SRCS pass_desc.proto DEPS framework_proto)

if(WITH_GPU)
nv_library(
trie_manager
SRCS trie_manager.cc trie_manager.cu trie.cc
DEPS tensor device_context math_function)
endif()

proto_library(op_def_proto SRCS op_def.proto DEPS framework_proto)
cc_library(
op_def_api
2 changes: 2 additions & 0 deletions paddle/fluid/framework/operator.cc
@@ -1420,6 +1420,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// result of HasAttr.
if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext))
enable_cache_runtime_context_ = true;
if (this->Type() == "fused_multi_transformer_int8" || this->Type() == "fused_multi_transformer_moe_int8")
enable_cache_runtime_context_ = true;
if (!all_kernels_must_compute_runtime_shape_ &&
HasAttr(kAllKernelsMustComputeRuntimeShape))
all_kernels_must_compute_runtime_shape_ = true;
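The two added lines force runtime-context caching on for fused_multi_transformer_int8 and fused_multi_transformer_moe_int8 even when the kEnableCacheRuntimeContext attribute is absent. A rough sketch of what that flag buys, with hypothetical stand-in types rather than Paddle's actual RunImpl: the operator resolves its input/output variables from the scope once and reuses that context on later steps instead of repeating the lookups.

#include <memory>
#include <string>
#include <unordered_map>

struct Var {};  // stand-in for framework::Variable

struct RuntimeContext {  // stand-in: op slot name -> resolved variable pointer
  std::unordered_map<std::string, Var*> inputs;
};

struct OperatorSketch {
  bool enable_cache_runtime_context_ = false;  // set for the int8 ops above
  std::unique_ptr<RuntimeContext> cached_;

  void Run(const std::unordered_map<std::string, Var*>& scope) {
    if (!enable_cache_runtime_context_ || cached_ == nullptr) {
      // The per-step variable lookup that caching avoids on steps 2..N.
      cached_ = std::make_unique<RuntimeContext>();
      cached_->inputs = scope;
    }
    // ... launch the kernel using *cached_ ...
  }
};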
160 changes: 160 additions & 0 deletions paddle/fluid/framework/trie.cc
@@ -0,0 +1,160 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <fstream>
#include <sstream>

#include "trie.h"
#include "paddle/phi/core/enforce.h"

namespace paddle {
namespace framework {
#define ENFORCE PADDLE_ENFORCE

// Trie
int Trie::load(const std::string& dir, const uint32_t thr_num) {
std::string list_file = dir + "/file_list";
std::ifstream ifs(list_file.c_str());
if (!ifs.is_open()) {
printf("open file %s failed\n", list_file.c_str());
return -1;
}

std::vector<File> files;
std::string line;
uint32_t node_num = 0;
while (getline(ifs, line)) {
std::stringstream ss(line);
File file;
ss >> file.filename;
ss >> file.node_num;

file.filename = dir + "/" + file.filename;
file.node_off = node_num;
node_num += file.node_num;
files.emplace_back(std::move(file));
}
printf("total file_num: %zu, node_num: %u\n", files.size(), node_num);

{
ScopedNanoTimer t("Trie stat");
parallel_run_range(files.size(), thr_num,
[this, &files](uint32_t thr_id, uint32_t start, uint32_t end) {
for (uint32_t i = start; i < end; ++i) {
stat_file(thr_id, files.at(i));
}
});
}

Node root;
for (auto& file: files) {
root.child.insert(root.child.end(), file.root.begin(), file.root.end());
}

{
ScopedNanoTimer t("Trie resize");
resize(node_num + 1); // +1 for root

size_t off = root.child.size();
for (size_t i = 0; i < files.size(); ++i) {
mem_off(files[i].node_off + 1) = off; //+1 for root
ENFORCE(files[i].node_num >= files[i].root.size());
off += files[i].node_num - files[i].root.size();
}
ENFORCE(off == node_num);
}

{
ScopedNanoTimer t("Trie load");
parallel_run_range(files.size(), thr_num,
[this, &files](uint32_t thr_id, uint32_t start, uint32_t end) {
for (size_t i = start; i < end; ++i) {
load_file(thr_id, files.at(i));
}
});
}

link(root);

return 0;
}

void Trie::parse(std::string& line, Node& node, uint32_t off) {
node.child.clear();

char* str = const_cast<char*>(line.c_str());
char* endstr = nullptr;
size_t len = 0;

node.id = std::strtoul(str, &endstr, 10) + off;
str = endstr;
ENFORCE(*str == '\t');
++str;

node.label = std::strtoul(str, &endstr, 10);
str = endstr;
ENFORCE(*str == '\t');
++str;

len = std::strtoul(str, &endstr, 10);
str = endstr;
for (size_t k = 0; k < len; ++k) {
node.child.push_back(std::strtoul(str, &endstr, 10) + off);
ENFORCE(str != endstr);
str = endstr;
++str;
}

node.aleaf = std::strtoul(str, &endstr, 10);
str = endstr;
ENFORCE(*str == '\0');
}

void Trie::stat_file(uint32_t thr_id, File& file) {
printf("stat file %s\n", file.filename.c_str());
Node node;

std::ifstream ifs(file.filename.c_str());
ENFORCE(ifs.is_open(), "open file %s failed\n", file.filename.c_str());

std::string line;
getline(ifs, line);

parse(line, node, file.node_off);
file.root = std::move(node.child);
}

void Trie::load_file(uint32_t thr_id, File& file) {
printf("load file %s\n", file.filename.c_str());

std::ifstream ifs(file.filename.c_str());
ENFORCE(ifs.is_open(), "open file %s failed\n", file.filename.c_str());

Node node;
std::string line;
// don't link root
if (getline(ifs, line)) {
parse(line, node, file.node_off);
file.root = std::move(node.child);
}

while(getline(ifs, line)) {
parse(line, node, file.node_off);
link(node);
}
}

#undef ENFORCE
} // end namespace framework
} // end namespace paddle
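Taken together, load(), stat_file(), and parse() imply the on-disk layout: dir/file_list holds one "<filename> <node_num>" pair per line, and each data file holds one node per line as "<id>\t<label>\t<child_count> <child_id>... <aleaf>", where the first line of every file describes that file's root (its children are grafted onto the single global root). A hedged usage sketch under that assumption (directory name and thread count are illustrative):

#include "trie.h"

int main() {
  paddle::framework::Trie trie;
  // Expects ./trie_dir/file_list plus the data files it names.
  if (trie.load("./trie_dir", /*thr_num=*/4) != 0) {
    return 1;
  }
  trie.print();  // level-order dump: [#id,label,aleaf,<children>]
  return 0;
}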
167 changes: 167 additions & 0 deletions paddle/fluid/framework/trie.h
@@ -0,0 +1,167 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cstdint>
#include <vector>
#include <string>
#include <iostream>
#include <stdexcept>
#include <chrono>
#include <thread>
#include <queue>

namespace paddle {
namespace framework {

template <class T>
void parallel_run_range(uint32_t n, uint32_t thr_num, T&& func) {
std::vector<std::thread> threads;
for (size_t i = 0; i < thr_num; ++i) {
threads.emplace_back(std::thread([i, n, thr_num, &func]() {
func(i, n * i / thr_num, n * (i + 1) / thr_num);
}));
}
for (auto& t : threads) {
t.join();
}
}

class ScopedNanoTimer {
public:
ScopedNanoTimer(const std::string& n) : t0(std::chrono::high_resolution_clock::now()), m(n) {
};
~ScopedNanoTimer() {
auto t1 = std::chrono::high_resolution_clock::now();
auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
printf("%s cost %fs\n", m.c_str(), nanos/1000000000.0);
}

protected:
std::chrono::high_resolution_clock::time_point t0;
std::string m;
};

class Trie {
struct File {
std::string filename;
std::vector<uint32_t> root;
uint32_t node_num = 0;
uint32_t node_off = 0;
};

struct Node {
uint32_t id = 0;
uint16_t label = 0;
std::vector<uint32_t> child;
uint8_t aleaf = 0;
};

public:
Trie() {}
virtual ~Trie() {}
int load(const std::string& dir, const uint32_t thr_num=20u);

uint16_t label(uint32_t id) {
return label_.at(id);
}

uint8_t aleaf(uint32_t id) {
return aleaf_.at(id);
}

void child(uint32_t id, std::vector<uint32_t>& child) {
child.clear();
size_t s = mem_off(id);
size_t e = mem_off(id + 1);
for (size_t i = s; i < e; ++i) {
child.push_back(child_mem_.at(i));
}
}

size_t child_size(uint32_t id) {
size_t s = mem_off(id);
size_t e = mem_off(id + 1);

return e - s;
}

size_t child_at(uint32_t id, size_t i) {
size_t s = mem_off(id);

return child_mem_.at(s + i);
}

void print() {
// level order traversal
std::queue<uint32_t> q;
q.push(0);
std::vector<uint32_t> child;

while(!q.empty()) {
size_t len = q.size();
for (size_t i = 0; i < len; ++i) {
uint32_t id = q.front();
q.pop();

printf("[#%u,%u,%u,<", id, label(id), aleaf(id));
this->child(id, child);
for (auto j : child) {
q.push(j);
printf("#%u,", j);
}
printf(">] ");
}
printf("\n");
}
}

protected:
void resize(uint32_t node_num) {
label_.resize(node_num);
aleaf_.resize(node_num);
child_mem_.resize(node_num);
mem_off_.resize(node_num + 1, 0);
}

uint32_t& mem_off(uint32_t id) {
return mem_off_.at(id);
}

void link(const Node& node) {
label_.at(node.id) = node.label;
aleaf_.at(node.id) = node.aleaf;

uint32_t addr = mem_off(node.id);
for (size_t i = 0; i < node.child.size(); ++i) {
child_mem_.at(addr++) = node.child[i];
}
if (mem_off(node.id + 1) == 0) {
mem_off(node.id + 1) = addr;
}
}

void parse(std::string& line, Node& node, uint32_t off=0);
void load_file(uint32_t thr_id, File& file);
void stat_file(uint32_t thr_id, File& file);

std::vector<uint16_t> label_;
std::vector<uint8_t> aleaf_;
std::vector<uint32_t> child_mem_;
std::vector<uint32_t> mem_off_;
};

} // end namespace framework
} // end namespace paddle
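Since the header exposes label(), aleaf(), child_size(), and child_at(), a lookup is a plain walk from the root (id 0), matching one label per edge; aleaf() reports whether the reached node terminates an entry. A small sketch built only on those accessors (the function name and calling convention are illustrative, not part of this PR):

#include <cstdint>
#include <vector>

#include "trie.h"

bool contains(paddle::framework::Trie& trie,
              const std::vector<uint16_t>& labels) {
  uint32_t cur = 0;  // root node id
  for (uint16_t want : labels) {
    bool matched = false;
    for (size_t i = 0; i < trie.child_size(cur); ++i) {
      uint32_t c = static_cast<uint32_t>(trie.child_at(cur, i));
      if (trie.label(c) == want) {
        cur = c;
        matched = true;
        break;
      }
    }
    if (!matched) return false;  // no outgoing edge with this label
  }
  return trie.aleaf(cur) != 0;   // true only at an entry-terminating node
}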