From 72f49512b175420b5f67cd1e98c78bbd065abf17 Mon Sep 17 00:00:00 2001 From: bestwoody <89765764+bestwoody@users.noreply.github.com> Date: Thu, 1 Sep 2022 04:26:24 +0800 Subject: [PATCH] Improve memtrack (#5610) close pingcap/tiflash#5609 --- dbms/src/Common/BackgroundTask.cpp | 94 ++++++++ dbms/src/Common/BackgroundTask.h | 39 +++ dbms/src/Common/MPMCQueue.h | 17 +- dbms/src/Common/MemoryTracker.cpp | 47 +++- dbms/src/Common/MemoryTracker.h | 9 +- dbms/src/Common/tests/gtest_memtracker.cpp | 121 ++++++++++ dbms/src/Flash/Coprocessor/DAGContext.cpp | 4 + dbms/src/Flash/Coprocessor/DAGContext.h | 1 + dbms/src/Flash/Coprocessor/StreamWriter.h | 2 +- .../StreamingDAGResponseWriter.cpp | 70 +++--- .../Coprocessor/StreamingDAGResponseWriter.h | 9 +- dbms/src/Flash/Mpp/ExchangeReceiver.cpp | 95 +++++--- dbms/src/Flash/Mpp/ExchangeReceiver.h | 10 +- dbms/src/Flash/Mpp/GRPCReceiverContext.cpp | 12 +- dbms/src/Flash/Mpp/GRPCReceiverContext.h | 8 +- dbms/src/Flash/Mpp/MPPReceiverSet.cpp | 9 + dbms/src/Flash/Mpp/MPPReceiverSet.h | 1 + dbms/src/Flash/Mpp/MPPTask.cpp | 3 +- dbms/src/Flash/Mpp/MPPTunnel.cpp | 32 ++- dbms/src/Flash/Mpp/MPPTunnel.h | 17 +- dbms/src/Flash/Mpp/MPPTunnelSet.cpp | 30 ++- dbms/src/Flash/Mpp/MPPTunnelSet.h | 1 + dbms/src/Flash/Mpp/TrackedMppDataPacket.h | 222 ++++++++++++++++++ dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp | 6 +- dbms/src/Flash/tests/bench_exchange.h | 2 +- dbms/src/Interpreters/ProcessList.cpp | 1 + dbms/src/Interpreters/Settings.h | 1 + dbms/src/Interpreters/executeQuery.cpp | 12 + dbms/src/Server/FlashGrpcServerHolder.cpp | 2 + dbms/src/Server/FlashGrpcServerHolder.h | 2 + 30 files changed, 744 insertions(+), 135 deletions(-) create mode 100644 dbms/src/Common/BackgroundTask.cpp create mode 100644 dbms/src/Common/BackgroundTask.h create mode 100644 dbms/src/Common/tests/gtest_memtracker.cpp create mode 100644 dbms/src/Flash/Mpp/TrackedMppDataPacket.h diff --git a/dbms/src/Common/BackgroundTask.cpp b/dbms/src/Common/BackgroundTask.cpp new file mode 100644 index 00000000000..16a23535541 --- /dev/null +++ b/dbms/src/Common/BackgroundTask.cpp @@ -0,0 +1,94 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +namespace DB +{ +bool process_mem_usage(double & resident_set) +{ + resident_set = 0.0; + + // 'file' stat seems to give the most reliable results + std::ifstream stat_stream("/proc/self/stat", std::ios_base::in); + // if "/proc/self/stat" is not supported + if (!stat_stream.is_open()) + return false; + + // dummy vars for leading entries in stat that we don't care about + std::string pid, comm, state, ppid, pgrp, session, tty_nr; + std::string tpgid, flags, minflt, cminflt, majflt, cmajflt; + std::string utime, stime, cutime, cstime, priority, nice; + std::string proc_num_threads, itrealvalue, starttime; + UInt64 vsize; + + // the field we want + Int64 rss; + + stat_stream >> pid >> comm >> state >> ppid >> pgrp >> session >> tty_nr + >> tpgid >> flags >> minflt >> cminflt >> majflt >> cmajflt + >> utime >> stime >> cutime >> cstime >> priority >> nice + >> proc_num_threads >> itrealvalue >> starttime >> vsize >> rss; // don't care about the rest + + stat_stream.close(); + + Int64 page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages + resident_set = rss * page_size_kb; + return true; +} + +bool isProcStatSupported() +{ + std::ifstream stat_stream("/proc/self/stat", std::ios_base::in); + return stat_stream.is_open(); +} + +void CollectProcInfoBackgroundTask::begin() +{ + std::unique_lock lk(mu); + if (!is_already_begin) + { + if (!isProcStatSupported()) + { + end_fin = true; + return; + } + std::thread t = ThreadFactory::newThread(false, "MemTrackThread", &CollectProcInfoBackgroundTask::memCheckJob, this); + t.detach(); + is_already_begin = true; + } +} + +void CollectProcInfoBackgroundTask::memCheckJob() +{ + double resident_set; + while (!end_syn) + { + process_mem_usage(resident_set); + resident_set *= 1024; // unit: byte + real_rss = static_cast(resident_set); + + usleep(100000); // sleep 100ms + } + end_fin = true; +} + +void CollectProcInfoBackgroundTask::end() +{ + end_syn = true; + while (!end_fin) + usleep(1000); // Just ok since it is called only when TiFlash shutdown. +} +} // namespace DB diff --git a/dbms/src/Common/BackgroundTask.h b/dbms/src/Common/BackgroundTask.h new file mode 100644 index 00000000000..afeacccd3bc --- /dev/null +++ b/dbms/src/Common/BackgroundTask.h @@ -0,0 +1,39 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
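The sampler above is intentionally minimal: begin() starts one detached "MemTrackThread" (or bails out early when /proc/self/stat is unavailable) that refreshes the global `real_rss` atomic roughly every 100 ms, and end() is a synchronous handshake over the `end_syn`/`end_fin` flags. A sketch of the intended lifecycle, assuming the TiFlash-style include path `<Common/BackgroundTask.h>`; `runServer()` is a hypothetical caller standing in for FlashGrpcServerHolder further down in this patch:

```cpp
// Hypothetical caller of CollectProcInfoBackgroundTask, mirroring how
// FlashGrpcServerHolder uses it later in this patch. Only begin()/end() are public;
// consumers never touch the sampler thread directly, they just read `real_rss`.
#include <Common/BackgroundTask.h>
#include <Common/MemoryTracker.h> // declares the global `real_rss` atomic

void runServer()
{
    DB::CollectProcInfoBackgroundTask background_task;
    background_task.begin(); // no-op on a second call; no thread at all if /proc/self/stat is missing

    // ... serve queries: MemoryTracker::alloc() consults `real_rss`,
    //     which the detached "MemTrackThread" refreshes every ~100 ms ...

    background_task.end();   // sets end_syn and polls (1 ms sleeps) until the sampler reports end_fin
}
```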
+ +#pragma once + +#include +namespace DB +{ +class CollectProcInfoBackgroundTask +{ +public: + CollectProcInfoBackgroundTask() = default; + ~CollectProcInfoBackgroundTask() + { + end(); + } + void begin(); + + void end(); + +private: + void memCheckJob(); + + std::mutex mu; + bool is_already_begin = false; + std::atomic end_syn{false}, end_fin{false}; +}; +} // namespace DB diff --git a/dbms/src/Common/MPMCQueue.h b/dbms/src/Common/MPMCQueue.h index ce6d4365393..db82cad0ff2 100644 --- a/dbms/src/Common/MPMCQueue.h +++ b/dbms/src/Common/MPMCQueue.h @@ -81,9 +81,13 @@ class MPMCQueue ~MPMCQueue() { - std::unique_lock lock(mu); - for (; read_pos < write_pos; ++read_pos) - destruct(getObj(read_pos)); + drain(); + } + + void finishAndDrain() + { + finish(); + drain(); } // Cannot to use copy/move constructor, @@ -418,6 +422,13 @@ class MPMCQueue obj.~T(); } + void drain() + { + std::unique_lock lock(mu); + for (; read_pos < write_pos; ++read_pos) + destruct(getObj(read_pos)); + } + template ALWAYS_INLINE bool changeStatus(F && action) { diff --git a/dbms/src/Common/MemoryTracker.cpp b/dbms/src/Common/MemoryTracker.cpp index f64881ae35a..da8ecf28e26 100644 --- a/dbms/src/Common/MemoryTracker.cpp +++ b/dbms/src/Common/MemoryTracker.cpp @@ -22,6 +22,7 @@ #include +std::atomic real_rss{0}; MemoryTracker::~MemoryTracker() { if (peak) @@ -46,6 +47,7 @@ MemoryTracker::~MemoryTracker() * then memory usage of 'next' memory trackers will be underestimated, * because amount will be decreased twice (first - here, second - when real 'free' happens). */ + // TODO In future, maybe we can find a better way to handle the "amount > 0" case. if (auto value = amount.load(std::memory_order_relaxed)) free(value); } @@ -80,7 +82,7 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) /// In this case, it doesn't matter. 
if (unlikely(fault_probability && drand48() < fault_probability)) { - free(size); + amount.fetch_sub(size, std::memory_order_relaxed); DB::FmtBuffer fmt_buf; fmt_buf.append("Memory tracker"); @@ -93,20 +95,33 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) throw DB::TiFlashException(fmt_buf.toString(), DB::Errors::Coprocessor::MemoryLimitExceeded); } - - if (unlikely(current_limit && will_be > current_limit)) + bool is_rss_too_large = (!next.load(std::memory_order_relaxed) && current_limit + && real_rss > current_limit + bytes_rss_larger_than_limit + && will_be > current_limit - (real_rss - current_limit - bytes_rss_larger_than_limit)); + if (is_rss_too_large + || unlikely(current_limit && will_be > current_limit)) { - free(size); + amount.fetch_sub(size, std::memory_order_relaxed); DB::FmtBuffer fmt_buf; fmt_buf.append("Memory limit"); if (description) fmt_buf.fmtAppend(" {}", description); - fmt_buf.fmtAppend(" exceeded: would use {} (attempt to allocate chunk of {} bytes), maximum: {}", - formatReadableSizeWithBinarySuffix(will_be), - size, - formatReadableSizeWithBinarySuffix(current_limit)); + if (!is_rss_too_large) + { // out of memory quota + fmt_buf.fmtAppend(" exceeded caused by 'out of memory quota for data computing' : would use {} for data computing (attempt to allocate chunk of {} bytes), limit of memory for data computing: {}", + formatReadableSizeWithBinarySuffix(will_be), + size, + formatReadableSizeWithBinarySuffix(current_limit)); + } + else + { // RSS too large + fmt_buf.fmtAppend(" exceeded caused by 'RSS(Resident Set Size) much larger than limit' : process memory size would be {} for (attempt to allocate chunk of {} bytes), limit of memory for data computing : {}", + formatReadableSizeWithBinarySuffix(real_rss), + size, + formatReadableSizeWithBinarySuffix(current_limit)); + } throw DB::TiFlashException(fmt_buf.toString(), DB::Errors::Coprocessor::MemoryLimitExceeded); } @@ -116,7 +131,17 @@ void MemoryTracker::alloc(Int64 size, bool check_memory_limit) peak.store(will_be, std::memory_order_relaxed); if (auto * loaded_next = next.load(std::memory_order_relaxed)) - loaded_next->alloc(size, check_memory_limit); + { + try + { + loaded_next->alloc(size, check_memory_limit); + } + catch (...) + { + amount.fetch_sub(size, std::memory_order_relaxed); + std::rethrow_exception(std::current_exception()); + } + } } @@ -130,7 +155,7 @@ void MemoryTracker::free(Int64 size) * Memory usage will be calculated with some error. * NOTE The code is not atomic. Not worth to fix. */ - if (new_amount < 0) + if (new_amount < 0 && !next.load(std::memory_order_relaxed)) // handle it only for root memory_tracker { amount.fetch_sub(new_amount); size += new_amount; @@ -170,7 +195,7 @@ thread_local MemoryTracker * current_memory_tracker = nullptr; namespace CurrentMemoryTracker { -static Int64 MEMORY_TRACER_SUBMIT_THRESHOLD = 8 * 1024 * 1024; // 8 MiB +static Int64 MEMORY_TRACER_SUBMIT_THRESHOLD = 1024 * 1024; // 1 MiB #if __APPLE__ && __clang__ static __thread Int64 local_delta{}; #else diff --git a/dbms/src/Common/MemoryTracker.h b/dbms/src/Common/MemoryTracker.h index c87ec713dda..3f4122edf9f 100644 --- a/dbms/src/Common/MemoryTracker.h +++ b/dbms/src/Common/MemoryTracker.h @@ -19,7 +19,7 @@ #include - +extern std::atomic real_rss; namespace CurrentMetrics { extern const Metric MemoryTracking; @@ -35,6 +35,9 @@ class MemoryTracker std::atomic peak{0}; std::atomic limit{0}; + // How many bytes RSS(Resident Set Size) can be larger than limit(max_memory_usage_for_all_queries). 
Default: 5GB + Int64 bytes_rss_larger_than_limit = 5368709120; + /// To test exception safety of calling code, memory tracker throws an exception on each memory allocation with specified probability. double fault_probability = 0; @@ -70,6 +73,8 @@ class MemoryTracker Int64 getPeak() const { return peak.load(std::memory_order_relaxed); } + Int64 getLimit() const { return limit.load(std::memory_order_relaxed); } + void setLimit(Int64 limit_) { limit.store(limit_, std::memory_order_relaxed); } /** Set limit if it was not set. @@ -77,6 +82,8 @@ class MemoryTracker */ void setOrRaiseLimit(Int64 value); + void setBytesThatRssLargerThanLimit(Int64 value) { bytes_rss_larger_than_limit = value; } + void setFaultProbability(double value) { fault_probability = value; } /// next should be changed only once: from nullptr to some value. diff --git a/dbms/src/Common/tests/gtest_memtracker.cpp b/dbms/src/Common/tests/gtest_memtracker.cpp new file mode 100644 index 00000000000..d31e7b42df4 --- /dev/null +++ b/dbms/src/Common/tests/gtest_memtracker.cpp @@ -0,0 +1,121 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +namespace DB::tests +{ +namespace +{ +class MemTrackerTest : public ::testing::Test +{ +}; + +TEST_F(MemTrackerTest, testBasic) +try +{ + MemoryTracker mem_tracker; + mem_tracker.alloc(1024); + ASSERT_EQ(1024, mem_tracker.get()); + mem_tracker.free(1024); + ASSERT_EQ(0, mem_tracker.get()); +} +CATCH + +TEST_F(MemTrackerTest, testRootAndChild) +try +{ + MemoryTracker root_mem_tracker; + MemoryTracker child_mem_tracker(512); + child_mem_tracker.setNext(&root_mem_tracker); + // alloc 500 + child_mem_tracker.alloc(500); + ASSERT_EQ(500, child_mem_tracker.get()); + ASSERT_EQ(500, root_mem_tracker.get()); + + // alloc 256 base on 500 + bool has_err = false; + try + { + child_mem_tracker.alloc(256); //500 + 256 > limit(512) + } + catch (...) + { + has_err = true; + } + ASSERT_TRUE(has_err); + ASSERT_EQ(500, child_mem_tracker.get()); + ASSERT_EQ(500, root_mem_tracker.get()); + + //free 500 + child_mem_tracker.free(500); + ASSERT_EQ(0, child_mem_tracker.get()); + ASSERT_EQ(0, root_mem_tracker.get()); +} +CATCH + +TEST_F(MemTrackerTest, testRootAndMultipleChild) +try +{ + MemoryTracker root(512); // limit 512 + MemoryTracker child1(512); // limit 512 + MemoryTracker child2(512); // limit 512 + child1.setNext(&root); + child2.setNext(&root); + // alloc 500 on child1 + child1.alloc(500); + ASSERT_EQ(500, child1.get()); + ASSERT_EQ(0, child2.get()); + ASSERT_EQ(500, root.get()); + + + // alloc 500 on child2, should fail + bool has_err = false; + try + { + child2.alloc(500); // root will throw error because of "out of quota" + } + catch (...) 
+ { + has_err = true; + } + ASSERT_TRUE(has_err); + ASSERT_EQ(500, child1.get()); + ASSERT_EQ(0, child2.get()); + ASSERT_EQ(500, root.get()); + + // alloc 10 on child2 + child2.alloc(10); + ASSERT_EQ(500, child1.get()); + ASSERT_EQ(10, child2.get()); + ASSERT_EQ(510, root.get()); + + // free 500 on child1 + child1.free(500); + ASSERT_EQ(0, child1.get()); + ASSERT_EQ(10, child2.get()); + ASSERT_EQ(10, root.get()); + + // free 10 on child2 + child2.free(10); + ASSERT_EQ(0, child1.get()); + ASSERT_EQ(0, child2.get()); + ASSERT_EQ(0, root.get()); +} +CATCH + + +} // namespace +} // namespace DB::tests diff --git a/dbms/src/Flash/Coprocessor/DAGContext.cpp b/dbms/src/Flash/Coprocessor/DAGContext.cpp index 6167090194a..46cb5e3bbf5 100644 --- a/dbms/src/Flash/Coprocessor/DAGContext.cpp +++ b/dbms/src/Flash/Coprocessor/DAGContext.cpp @@ -240,4 +240,8 @@ const SingleTableRegions & DAGContext::getTableRegionsInfoByTableID(Int64 table_ { return tables_regions_info.getTableRegionInfoByTableID(table_id); } +const MPPReceiverSetPtr & DAGContext::getMppReceiverSet() const +{ + return mpp_receiver_set; +} } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/DAGContext.h b/dbms/src/Flash/Coprocessor/DAGContext.h index 93e7edda7e8..af5932ac444 100644 --- a/dbms/src/Flash/Coprocessor/DAGContext.h +++ b/dbms/src/Flash/Coprocessor/DAGContext.h @@ -305,6 +305,7 @@ class DAGContext { mpp_receiver_set = receiver_set; } + const MPPReceiverSetPtr & getMppReceiverSet() const; void addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader); std::vector & getCoprocessorReaders(); diff --git a/dbms/src/Flash/Coprocessor/StreamWriter.h b/dbms/src/Flash/Coprocessor/StreamWriter.h index fad403b0726..41b108492f5 100644 --- a/dbms/src/Flash/Coprocessor/StreamWriter.h +++ b/dbms/src/Flash/Coprocessor/StreamWriter.h @@ -54,7 +54,7 @@ struct StreamWriter { ::coprocessor::BatchResponse resp; if (!response.SerializeToString(resp.mutable_data())) - throw Exception("Fail to serialize response, response size: " + std::to_string(response.ByteSizeLong())); + throw Exception("[StreamWriter]Fail to serialize response, response size: " + std::to_string(response.ByteSizeLong())); std::lock_guard lk(write_mutex); if (!writer->Write(resp)) throw Exception("Failed to write resp"); diff --git a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp index 6e70f280e6f..5fa7eb245ab 100644 --- a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp +++ b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -135,62 +136,61 @@ template template void StreamingDAGResponseWriter::encodeThenWriteBlocks( const std::vector & input_blocks, - tipb::SelectResponse & response) const + TrackedSelectResp & response) const { if (dag_context.encode_type == tipb::EncodeType::TypeCHBlock) { if (dag_context.isMPPTask()) /// broadcast data among TiFlash nodes in MPP { - mpp::MPPDataPacket packet; + TrackedMppDataPacket tracked_packet(current_memory_tracker); if constexpr (send_exec_summary_at_last) { - serializeToPacket(packet, response); + tracked_packet.serializeByResponse(response.getResponse()); } if (input_blocks.empty()) { if constexpr (send_exec_summary_at_last) { - writer->write(packet); + writer->write(tracked_packet.getPacket()); } return; } for (const auto & block : input_blocks) { chunk_codec_stream->encode(block, 0, block.rows()); - packet.add_chunks(chunk_codec_stream->getString()); + 
tracked_packet.addChunk(chunk_codec_stream->getString()); chunk_codec_stream->clear(); } - writer->write(packet); + writer->write(tracked_packet.getPacket()); } else /// passthrough data to a non-TiFlash node, like sending data to TiSpark { - response.set_encode_type(dag_context.encode_type); + response.setEncodeType(dag_context.encode_type); if (input_blocks.empty()) { if constexpr (send_exec_summary_at_last) { - writer->write(response); + writer->write(response.getResponse()); } return; } for (const auto & block : input_blocks) { chunk_codec_stream->encode(block, 0, block.rows()); - auto * dag_chunk = response.add_chunks(); - dag_chunk->set_rows_data(chunk_codec_stream->getString()); + response.addChunk(chunk_codec_stream->getString()); chunk_codec_stream->clear(); } - writer->write(response); + writer->write(response.getResponse()); } } else /// passthrough data to a TiDB node { - response.set_encode_type(dag_context.encode_type); + response.setEncodeType(dag_context.encode_type); if (input_blocks.empty()) { if constexpr (send_exec_summary_at_last) { - writer->write(response); + writer->write(response.getResponse()); } return; } @@ -203,8 +203,7 @@ void StreamingDAGResponseWriter::e { if (current_records_num >= records_per_chunk) { - auto * dag_chunk = response.add_chunks(); - dag_chunk->set_rows_data(chunk_codec_stream->getString()); + response.addChunk(chunk_codec_stream->getString()); chunk_codec_stream->clear(); current_records_num = 0; } @@ -217,11 +216,10 @@ void StreamingDAGResponseWriter::e if (current_records_num > 0) { - auto * dag_chunk = response.add_chunks(); - dag_chunk->set_rows_data(chunk_codec_stream->getString()); + response.addChunk(chunk_codec_stream->getString()); chunk_codec_stream->clear(); } - writer->write(response); + writer->write(response.getResponse()); } } @@ -230,9 +228,9 @@ template template void StreamingDAGResponseWriter::batchWrite() { - tipb::SelectResponse response; + TrackedSelectResp response; if constexpr (send_exec_summary_at_last) - addExecuteSummaries(response, !dag_context.isMPPTask() || dag_context.isRootMPPTask()); + addExecuteSummaries(response.getResponse(), !dag_context.isMPPTask() || dag_context.isRootMPPTask()); if (exchange_type == tipb::ExchangeType::Hash) { partitionAndEncodeThenWriteBlocks(blocks, response); @@ -249,13 +247,13 @@ template template void StreamingDAGResponseWriter::handleExecSummary( const std::vector & input_blocks, - std::vector & packet, + std::vector & packets, tipb::SelectResponse & response) const { if constexpr (send_exec_summary_at_last) { /// Sending the response to only one node, default the first one. - serializeToPacket(packet[0], response); + packets[0].serializeByResponse(response); // No need to send data when blocks are not empty, // because exec_summary will be sent together with blocks. 
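Every write path in this file now follows the same pattern: encoded chunks are appended through a TrackedMppDataPacket (or TrackedSelectResp), whose MemTrackerWrapper charges the bytes to the query's MemoryTracker before they are buffered, so a memory-limit violation throws before the packet actually grows. A condensed sketch of that loop, using the TrackedMppDataPacket interface introduced later in this patch; `writer`, `blocks` and `chunk_codec_stream` are stand-ins for the corresponding members of StreamingDAGResponseWriter:

```cpp
// Condensed from encodeThenWriteBlocks(): addChunk() calls MemoryTracker::alloc()
// before the string is stored, and ~TrackedMppDataPacket frees the tracked bytes
// once the packet has been handed to the writer and goes out of scope.
#include <Flash/Mpp/TrackedMppDataPacket.h>

template <typename WriterPtr, typename Blocks, typename CodecStreamPtr>
void writeTracked(WriterPtr & writer, const Blocks & blocks, CodecStreamPtr & chunk_codec_stream)
{
    DB::TrackedMppDataPacket tracked_packet(current_memory_tracker);
    for (const auto & block : blocks)
    {
        chunk_codec_stream->encode(block, 0, block.rows());
        tracked_packet.addChunk(chunk_codec_stream->getString()); // tracker alloc happens here
        chunk_codec_stream->clear();
    }
    writer->write(tracked_packet.getPacket()); // the raw mpp::MPPDataPacket goes on the wire
}
```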
@@ -263,7 +261,7 @@ void StreamingDAGResponseWriter::h { for (auto part_id = 0; part_id < partition_num; ++part_id) { - writer->write(packet[part_id], part_id); + writer->write(packets[part_id].getPacket(), part_id); } } } @@ -273,18 +271,18 @@ template template void StreamingDAGResponseWriter::writePackets( const std::vector & responses_row_count, - std::vector & packets) const + std::vector & packets) const { for (size_t part_id = 0; part_id < packets.size(); ++part_id) { if constexpr (send_exec_summary_at_last) { - writer->write(packets[part_id], part_id); + writer->write(packets[part_id].getPacket(), part_id); } else { if (responses_row_count[part_id] > 0) - writer->write(packets[part_id], part_id); + writer->write(packets[part_id].getPacket(), part_id); } } } @@ -354,12 +352,12 @@ template template void StreamingDAGResponseWriter::partitionAndEncodeThenWriteBlocks( std::vector & input_blocks, - tipb::SelectResponse & response) const + TrackedSelectResp & response) const { static_assert(!enable_fine_grained_shuffle); - std::vector packet(partition_num); + std::vector tracked_packets(partition_num); std::vector responses_row_count(partition_num); - handleExecSummary(input_blocks, packet, response); + handleExecSummary(input_blocks, tracked_packets, response.getResponse()); if (input_blocks.empty()) return; @@ -378,12 +376,12 @@ void StreamingDAGResponseWriter::p dest_block.setColumns(std::move(dest_tbl_cols[part_id])); responses_row_count[part_id] += dest_block.rows(); chunk_codec_stream->encode(dest_block, 0, dest_block.rows()); - packet[part_id].add_chunks(chunk_codec_stream->getString()); + tracked_packets[part_id].addChunk(chunk_codec_stream->getString()); chunk_codec_stream->clear(); } } - writePackets(responses_row_count, packet); + writePackets(responses_row_count, tracked_packets); } /// Hash exchanging data among only TiFlash nodes. Only be called when enable_fine_grained_shuffle is true. @@ -399,12 +397,12 @@ void StreamingDAGResponseWriter::b if constexpr (send_exec_summary_at_last) addExecuteSummaries(response, !dag_context.isMPPTask() || dag_context.isRootMPPTask()); - std::vector packet(partition_num); + std::vector tracked_packets(partition_num); std::vector responses_row_count(partition_num, 0); // fine_grained_shuffle_stream_count is in [0, 1024], and partition_num is uint16_t, so will not overflow. 
uint32_t bucket_num = partition_num * fine_grained_shuffle_stream_count; - handleExecSummary(blocks, packet, response); + handleExecSummary(blocks, tracked_packets, response); if (!blocks.empty()) { std::vector final_dest_tbl_columns(bucket_num); @@ -441,15 +439,15 @@ void StreamingDAGResponseWriter::b row_count_per_part += dest_block.rows(); chunk_codec_stream->encode(dest_block, 0, dest_block.rows()); - packet[part_id].add_chunks(chunk_codec_stream->getString()); - packet[part_id].add_stream_ids(stream_idx); + tracked_packets[part_id].addChunk(chunk_codec_stream->getString()); + tracked_packets[part_id].packet.add_stream_ids(stream_idx); chunk_codec_stream->clear(); } responses_row_count[part_id] = row_count_per_part; } } - writePackets(responses_row_count, packet); + writePackets(responses_row_count, tracked_packets); blocks.clear(); rows_in_blocks = 0; diff --git a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h index cd7559d1e79..1e37090509b 100644 --- a/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h +++ b/dbms/src/Flash/Coprocessor/StreamingDAGResponseWriter.h @@ -23,6 +23,7 @@ #include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" +#include #include #include @@ -58,16 +59,16 @@ class StreamingDAGResponseWriter : public DAGResponseWriter void batchWriteFineGrainedShuffle(); template - void encodeThenWriteBlocks(const std::vector & input_blocks, tipb::SelectResponse & response) const; + void encodeThenWriteBlocks(const std::vector & input_blocks, TrackedSelectResp & response) const; template - void partitionAndEncodeThenWriteBlocks(std::vector & input_blocks, tipb::SelectResponse & response) const; + void partitionAndEncodeThenWriteBlocks(std::vector & input_blocks, TrackedSelectResp & response) const; template void handleExecSummary(const std::vector & input_blocks, - std::vector & packet, + std::vector & packet, tipb::SelectResponse & response) const; template - void writePackets(const std::vector & responses_row_count, std::vector & packets) const; + void writePackets(const std::vector & responses_row_count, std::vector & packets) const; Int64 batch_send_min_limit; bool should_send_exec_summary_at_last; /// only one stream needs to sending execution summaries at last. diff --git a/dbms/src/Flash/Mpp/ExchangeReceiver.cpp b/dbms/src/Flash/Mpp/ExchangeReceiver.cpp index c896757c84a..3cdb0c2a184 100644 --- a/dbms/src/Flash/Mpp/ExchangeReceiver.cpp +++ b/dbms/src/Flash/Mpp/ExchangeReceiver.cpp @@ -59,26 +59,27 @@ String getReceiverStateStr(const ExchangeReceiverState & s) template bool pushPacket(size_t source_index, const String & req_info, - MPPDataPacketPtr & packet, + const TrackedMppDataPacketPtr & tracked_packet, const std::vector & msg_channels, LoggerPtr & log) { bool push_succeed = true; const mpp::Error * error_ptr = nullptr; - if (packet->has_error()) - error_ptr = &packet->error(); + auto & packet = tracked_packet->packet; + if (packet.has_error()) + error_ptr = &packet.error(); const String * resp_ptr = nullptr; - if (!packet->data().empty()) - resp_ptr = &packet->data(); + if (!packet.data().empty()) + resp_ptr = &packet.data(); if constexpr (enable_fine_grained_shuffle) { std::vector> chunks(msg_channels.size()); - if (!packet->chunks().empty()) + if (!packet.chunks().empty()) { // Packet not empty. - if (unlikely(packet->stream_ids().empty())) + if (unlikely(packet.stream_ids().empty())) { // Fine grained shuffle is enabled in receiver, but sender didn't. 
We cannot handle this, so return error. // This can happen when there are old version nodes when upgrading. @@ -90,12 +91,12 @@ bool pushPacket(size_t source_index, } // packet.stream_ids[i] is corresponding to packet.chunks[i], // indicating which stream_id this chunk belongs to. - assert(packet->chunks_size() == packet->stream_ids_size()); + assert(packet.chunks_size() == packet.stream_ids_size()); - for (int i = 0; i < packet->stream_ids_size(); ++i) + for (int i = 0; i < packet.stream_ids_size(); ++i) { - UInt64 stream_id = packet->stream_ids(i) % msg_channels.size(); - chunks[stream_id].push_back(&packet->chunks(i)); + UInt64 stream_id = packet.stream_ids(i) % msg_channels.size(); + chunks[stream_id].push_back(&packet.chunks(i)); } } // Still need to send error_ptr or resp_ptr even if packet.chunks_size() is zero. @@ -107,7 +108,7 @@ bool pushPacket(size_t source_index, std::shared_ptr recv_msg = std::make_shared( source_index, req_info, - packet, + tracked_packet, error_ptr, resp_ptr, std::move(chunks[i])); @@ -123,10 +124,10 @@ bool pushPacket(size_t source_index, } else { - std::vector chunks(packet->chunks_size()); - for (int i = 0; i < packet->chunks_size(); ++i) + std::vector chunks(packet.chunks_size()); + for (int i = 0; i < packet.chunks_size(); ++i) { - chunks[i] = &packet->chunks(i); + chunks[i] = &packet.chunks(i); } if (!(resp_ptr == nullptr && error_ptr == nullptr && chunks.empty())) @@ -134,7 +135,7 @@ bool pushPacket(size_t source_index, std::shared_ptr recv_msg = std::make_shared( source_index, req_info, - packet, + tracked_packet, error_ptr, resp_ptr, std::move(chunks)); @@ -190,7 +191,7 @@ class AsyncRequestHandler : public UnaryCallback { packets.resize(batch_packet_count); for (auto & packet : packets) - packet = std::make_shared(); + packet = std::make_shared(); start(); } @@ -212,7 +213,7 @@ class AsyncRequestHandler : public UnaryCallback case AsyncRequestStage::WAIT_BATCH_READ: if (ok) ++read_packet_index; - if (!ok || read_packet_index == batch_packet_count || packets[read_packet_index - 1]->has_error()) + if (!ok || read_packet_index == batch_packet_count || packets[read_packet_index - 1]->hasError()) notifyReactor(); else reader->read(packets[read_packet_index], thisAsUnaryCallback()); @@ -228,6 +229,7 @@ class AsyncRequestHandler : public UnaryCallback // handle will be called by ExchangeReceiver::reactor. void handle() { + std::string err_info; LOG_FMT_TRACE(log, "stage: {}", stage); switch (stage) { @@ -251,8 +253,8 @@ class AsyncRequestHandler : public UnaryCallback if (auto packet = getErrorPacket()) setDone("Exchange receiver meet error : " + packet->error().msg()); - else if (!sendPackets()) - setDone("Exchange receiver meet error : push packets fail"); + else if (!sendPackets(err_info)) + setDone("Exchange receiver meet error : push packets fail, " + err_info); else if (read_packet_index < batch_packet_count) { stage = AsyncRequestStage::WAIT_FINISH; @@ -314,10 +316,10 @@ class AsyncRequestHandler : public UnaryCallback notify_queue->push(this); } - MPPDataPacketPtr getErrorPacket() const + TrackedMppDataPacketPtr getErrorPacket() const { // only the last packet may has error, see execute(). 
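        // (execute() stops reading at the first failed read or at the first packet that
        //  carries an error, so an error, if any, is always in the most recently read slot.)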
- if (read_packet_index != 0 && packets[read_packet_index - 1]->has_error()) + if (read_packet_index != 0 && packets[read_packet_index - 1]->hasError()) return packets[read_packet_index - 1]; return nullptr; } @@ -357,15 +359,31 @@ class AsyncRequestHandler : public UnaryCallback setDone(done_msg); } - bool sendPackets() + bool sendPackets(std::string & err_info) { + // note: no exception should be thrown rudely, since it's called by a GRPC poller. for (size_t i = 0; i < read_packet_index; ++i) { auto & packet = packets[i]; - if (!pushPacket(request->source_index, req_info, packet, *msg_channels, log)) + // We shouldn't throw error directly, since the caller works in a standalone thread. + try + { + packet->recomputeTrackedMem(); + if (!pushPacket( + request->source_index, + req_info, + packet, + *msg_channels, + log)) + return false; + } + catch (...) + { + err_info = getCurrentExceptionMessage(false); return false; + } // can't reuse packet since it is sent to readers. - packet = std::make_shared(); + packet = std::make_shared(); } return true; } @@ -390,7 +408,7 @@ class AsyncRequestHandler : public UnaryCallback AsyncRequestStage stage = AsyncRequestStage::NEED_INIT; std::shared_ptr reader; - MPPDataPacketPtrs packets; + TrackedMPPDataPacketPtrs packets; size_t read_packet_index = 0; Status finish_status = RPCContext::getStatusOK(); LoggerPtr log; @@ -405,7 +423,8 @@ ExchangeReceiverBase::ExchangeReceiverBase( size_t max_streams_, const String & req_id, const String & executor_id, - uint64_t fine_grained_shuffle_stream_count_) + uint64_t fine_grained_shuffle_stream_count_, + bool setup_conn_manually) : rpc_context(std::move(rpc_context_)) , source_num(source_num_) , max_streams(max_streams_) @@ -431,7 +450,11 @@ ExchangeReceiverBase::ExchangeReceiverBase( msg_channels.push_back(std::make_unique>>(max_buffer_size)); } rpc_context->fillSchema(schema); - setUpConnection(); + if (!setup_conn_manually) + { + // In CH client case, we need setUpConn right now. However, MPPTask will setUpConnection manually after ProcEntry is created. + setUpConnection(); + } } catch (...) { @@ -479,6 +502,8 @@ void ExchangeReceiverBase::close() template void ExchangeReceiverBase::setUpConnection() { + if (thread_count) + return; std::vector async_requests; for (size_t index = 0; index < source_num; ++index) @@ -601,15 +626,20 @@ void ExchangeReceiverBase::readLoop(const Request & req) for (;;) { LOG_FMT_TRACE(log, "begin next "); - MPPDataPacketPtr packet = std::make_shared(); + TrackedMppDataPacketPtr packet = std::make_shared(); bool success = reader->read(packet); if (!success) break; has_data = true; - if (packet->has_error()) + if (packet->hasError()) throw Exception("Exchange receiver meet error : " + packet->error().msg()); - if (!pushPacket(req.source_index, req_info, packet, msg_channels, log)) + if (!pushPacket( + req.source_index, + req_info, + packet, + msg_channels, + log)) { meet_error = true; auto local_state = getState(); @@ -681,9 +711,10 @@ DecodeDetail ExchangeReceiverBase::decodeChunks( if (recv_msg->chunks.empty()) return detail; + auto & packet = recv_msg->packet->packet; // Record total packet size even if fine grained shuffle is enabled. 
- detail.packet_bytes = recv_msg->packet->ByteSizeLong(); + detail.packet_bytes = packet.ByteSizeLong(); for (const String * chunk : recv_msg->chunks) { diff --git a/dbms/src/Flash/Mpp/ExchangeReceiver.h b/dbms/src/Flash/Mpp/ExchangeReceiver.h index 9213eb76e60..8eed7878545 100644 --- a/dbms/src/Flash/Mpp/ExchangeReceiver.h +++ b/dbms/src/Flash/Mpp/ExchangeReceiver.h @@ -38,7 +38,7 @@ struct ReceivedMessage size_t source_index; String req_info; // shared_ptr is copied to make sure error_ptr, resp_ptr and chunks are valid. - const std::shared_ptr packet; + const std::shared_ptr packet; const mpp::Error * error_ptr; const String * resp_ptr; std::vector chunks; @@ -46,7 +46,7 @@ struct ReceivedMessage // Constructor that move chunks. ReceivedMessage(size_t source_index_, const String & req_info_, - const std::shared_ptr & packet_, + const std::shared_ptr & packet_, const mpp::Error * error_ptr_, const String * resp_ptr_, std::vector && chunks_) @@ -129,10 +129,13 @@ class ExchangeReceiverBase size_t max_streams_, const String & req_id, const String & executor_id, - uint64_t fine_grained_shuffle_stream_count); + uint64_t fine_grained_shuffle_stream_count, + bool setup_conn_manually = false); ~ExchangeReceiverBase(); + void setUpConnection(); + void cancel(); void close(); @@ -166,7 +169,6 @@ class ExchangeReceiverBase private: using Request = typename RPCContext::Request; - void setUpConnection(); // Template argument enable_fine_grained_shuffle will be setup properly in setUpConnection(). template void readLoop(const Request & req); diff --git a/dbms/src/Flash/Mpp/GRPCReceiverContext.cpp b/dbms/src/Flash/Mpp/GRPCReceiverContext.cpp index 310745aa024..236d15f9093 100644 --- a/dbms/src/Flash/Mpp/GRPCReceiverContext.cpp +++ b/dbms/src/Flash/Mpp/GRPCReceiverContext.cpp @@ -63,9 +63,9 @@ struct GrpcExchangePacketReader : public ExchangePacketReader call = std::make_shared>(req.req); } - bool read(MPPDataPacketPtr & packet) override + bool read(TrackedMppDataPacketPtr & packet) override { - return reader->Read(packet.get()); + return packet->read(reader); } ::grpc::Status finish() override @@ -101,9 +101,9 @@ struct AsyncGrpcExchangePacketReader : public AsyncExchangePacketReader callback); } - void read(MPPDataPacketPtr & packet, UnaryCallback * callback) override + void read(TrackedMppDataPacketPtr & packet, UnaryCallback * callback) override { - reader->Read(packet.get(), callback); + packet->read(reader, callback); } void finish(::grpc::Status & status, UnaryCallback * callback) override @@ -131,9 +131,9 @@ struct LocalExchangePacketReader : public ExchangePacketReader } } - bool read(MPPDataPacketPtr & packet) override + bool read(TrackedMppDataPacketPtr & packet) override { - MPPDataPacketPtr tmp_packet = local_tunnel_sender->readForLocal(); + TrackedMppDataPacketPtr tmp_packet = local_tunnel_sender->readForLocal(); bool success = tmp_packet != nullptr; if (success) packet = tmp_packet; diff --git a/dbms/src/Flash/Mpp/GRPCReceiverContext.h b/dbms/src/Flash/Mpp/GRPCReceiverContext.h index 3868271ff8a..a6e9afb059b 100644 --- a/dbms/src/Flash/Mpp/GRPCReceiverContext.h +++ b/dbms/src/Flash/Mpp/GRPCReceiverContext.h @@ -28,14 +28,14 @@ namespace DB { using MPPDataPacket = mpp::MPPDataPacket; -using MPPDataPacketPtr = std::shared_ptr; -using MPPDataPacketPtrs = std::vector; +using TrackedMppDataPacketPtr = std::shared_ptr; +using TrackedMPPDataPacketPtrs = std::vector; class ExchangePacketReader { public: virtual ~ExchangePacketReader() = default; - virtual bool read(MPPDataPacketPtr & packet) = 0; 
+ virtual bool read(TrackedMppDataPacketPtr & packet) = 0; virtual ::grpc::Status finish() = 0; }; using ExchangePacketReaderPtr = std::shared_ptr; @@ -45,7 +45,7 @@ class AsyncExchangePacketReader public: virtual ~AsyncExchangePacketReader() = default; virtual void init(UnaryCallback * callback) = 0; - virtual void read(MPPDataPacketPtr & packet, UnaryCallback * callback) = 0; + virtual void read(TrackedMppDataPacketPtr & packet, UnaryCallback * callback) = 0; virtual void finish(::grpc::Status & status, UnaryCallback * callback) = 0; }; using AsyncExchangePacketReaderPtr = std::shared_ptr; diff --git a/dbms/src/Flash/Mpp/MPPReceiverSet.cpp b/dbms/src/Flash/Mpp/MPPReceiverSet.cpp index fd8da091224..c433d8aa9a9 100644 --- a/dbms/src/Flash/Mpp/MPPReceiverSet.cpp +++ b/dbms/src/Flash/Mpp/MPPReceiverSet.cpp @@ -44,6 +44,15 @@ void MPPReceiverSet::cancel() cop_reader->cancel(); } + +void MPPReceiverSet::setUpConnection() +{ + for (auto & it : exchange_receiver_map) + { + it.second->setUpConnection(); + } +} + void MPPReceiverSet::close() { for (auto & it : exchange_receiver_map) diff --git a/dbms/src/Flash/Mpp/MPPReceiverSet.h b/dbms/src/Flash/Mpp/MPPReceiverSet.h index 367fd6859ce..71ae8606e8f 100644 --- a/dbms/src/Flash/Mpp/MPPReceiverSet.h +++ b/dbms/src/Flash/Mpp/MPPReceiverSet.h @@ -28,6 +28,7 @@ class MPPReceiverSet void addExchangeReceiver(const String & executor_id, const ExchangeReceiverPtr & exchange_receiver); void addCoprocessorReader(const CoprocessorReaderPtr & coprocessor_reader); ExchangeReceiverPtr getExchangeReceiver(const String & executor_id) const; + void setUpConnection(); void cancel(); void close(); diff --git a/dbms/src/Flash/Mpp/MPPTask.cpp b/dbms/src/Flash/Mpp/MPPTask.cpp index 1444d2f1963..121352595aa 100644 --- a/dbms/src/Flash/Mpp/MPPTask.cpp +++ b/dbms/src/Flash/Mpp/MPPTask.cpp @@ -201,7 +201,8 @@ void MPPTask::initExchangeReceivers() context->getMaxStreams(), log->identifier(), executor_id, - executor.fine_grained_shuffle_stream_count()); + executor.fine_grained_shuffle_stream_count(), + true); if (status != RUNNING) throw Exception("exchange receiver map can not be initialized, because the task is not in running state"); diff --git a/dbms/src/Flash/Mpp/MPPTunnel.cpp b/dbms/src/Flash/Mpp/MPPTunnel.cpp index b1a25095ba0..7a9f4cdea76 100644 --- a/dbms/src/Flash/Mpp/MPPTunnel.cpp +++ b/dbms/src/Flash/Mpp/MPPTunnel.cpp @@ -68,8 +68,9 @@ MPPTunnel::MPPTunnel( , status(TunnelStatus::Unconnected) , timeout(timeout_) , tunnel_id(tunnel_id_) - , send_queue(std::make_shared>(std::max(5, input_steams_num_ * 5))) // MPMCQueue can benefit from a slightly larger queue size + , send_queue(std::make_shared>(std::max(5, input_steams_num_ * 5))) // MPMCQueue can benefit from a slightly larger queue size , log(Logger::get("MPPTunnel", req_id, tunnel_id)) + , mem_tracker(current_memory_tracker) { RUNTIME_ASSERT(!(is_local_ && is_async_), log, "is_local: {}, is_async: {}.", is_local_, is_async_); if (is_local_) @@ -109,6 +110,10 @@ void MPPTunnel::finishSendQueue() /// exit abnormally, such as being cancelled. 
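/// Note: close() also finishes and drains `send_queue` on exit (see the SCOPE_EXIT below),
/// so any TrackedMppDataPacket still queued releases its tracked bytes before the query's
/// MemoryTracker, which lives in the ProcessListEntry, is destroyed.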
void MPPTunnel::close(const String & reason) { + SCOPE_EXIT({ + // ensure the tracked memory is released and udpated before memotry tracker(in ProcListEntry) is released + send_queue->finishAndDrain(); // drain the send_queue when close + }); { std::unique_lock lk(*mu); switch (status) @@ -124,7 +129,7 @@ void MPPTunnel::close(const String & reason) try { FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::exception_during_mpp_close_tunnel); - send_queue->push(std::make_shared(getPacketWithError(reason))); + send_queue->push(std::make_shared(getPacketWithError(reason), mem_tracker)); if (mode == TunnelSenderMode::ASYNC_GRPC) async_tunnel_sender->tryFlushOne(); } @@ -159,7 +164,7 @@ void MPPTunnel::write(const mpp::MPPDataPacket & data, bool close_after_write) if (status == TunnelStatus::Finished) throw Exception(fmt::format("write to tunnel which is already closed,{}", tunnel_sender ? tunnel_sender->getConsumerFinishMsg() : "")); - if (send_queue->push(std::make_shared(data)) == MPMCQueueResult::OK) + if (send_queue->push(std::make_shared(data, mem_tracker)) == MPMCQueueResult::OK) { connection_profile_info.bytes += data.ByteSizeLong(); connection_profile_info.packets += 1; @@ -301,6 +306,11 @@ StringRef MPPTunnel::statusToString() } } +void MPPTunnel::updateMemTracker() +{ + mem_tracker = current_memory_tracker; +} + void TunnelSender::consumerFinish(const String & msg) { LOG_FMT_TRACE(log, "calling consumer Finish"); @@ -321,10 +331,10 @@ void SyncTunnelSender::sendJob() String err_msg; try { - MPPDataPacketPtr res; + TrackedMppDataPacketPtr res; while (send_queue->pop(res) == MPMCQueueResult::OK) { - if (!writer->write(*res)) + if (!writer->write(res->packet)) { err_msg = "grpc writes failed."; break; @@ -379,11 +389,11 @@ void AsyncTunnelSender::sendOne(bool use_lock) bool queue_empty_flag = false; try { - MPPDataPacketPtr res; + TrackedMppDataPacketPtr res; queue_empty_flag = send_queue->pop(res) != MPMCQueueResult::OK; if (!queue_empty_flag) { - if (!writer->write(*res)) + if (!writer->write(res->packet)) { err_msg = "grpc writes failed."; } @@ -413,11 +423,15 @@ void AsyncTunnelSender::sendOne(bool use_lock) } } -LocalTunnelSender::MPPDataPacketPtr LocalTunnelSender::readForLocal() +std::shared_ptr LocalTunnelSender::readForLocal() { - MPPDataPacketPtr res; + TrackedMppDataPacketPtr res; if (send_queue->pop(res) == MPMCQueueResult::OK) + { + // switch tunnel's memory tracker into receiver's + res->switchMemTracker(current_memory_tracker); return res; + } consumerFinish(""); return nullptr; } diff --git a/dbms/src/Flash/Mpp/MPPTunnel.h b/dbms/src/Flash/Mpp/MPPTunnel.h index 5243c9aaf36..e6e6ad94bbd 100644 --- a/dbms/src/Flash/Mpp/MPPTunnel.h +++ b/dbms/src/Flash/Mpp/MPPTunnel.h @@ -33,6 +33,8 @@ #include #pragma GCC diagnostic pop +#include + #include #include #include @@ -61,8 +63,8 @@ enum class TunnelSenderMode class TunnelSender : private boost::noncopyable { public: - using MPPDataPacketPtr = std::shared_ptr; - using DataPacketMPMCQueuePtr = std::shared_ptr>; + using TrackedMppDataPacketPtr = std::shared_ptr; + using DataPacketMPMCQueuePtr = std::shared_ptr>; virtual ~TunnelSender() = default; TunnelSender(TunnelSenderMode mode_, DataPacketMPMCQueuePtr send_queue_, PacketWriter * writer_, const LoggerPtr log_, const String & tunnel_id_) : mode(mode_) @@ -177,7 +179,7 @@ class LocalTunnelSender : public TunnelSender public: using Base = TunnelSender; using Base::Base; - MPPDataPacketPtr readForLocal(); + TrackedMppDataPacketPtr readForLocal(); }; using TunnelSenderPtr = std::shared_ptr; @@ 
-202,7 +204,7 @@ using LocalTunnelSenderPtr = std::shared_ptr; * To be short: before connect, only close can finish a MPPTunnel; after connect, only Sender Finish can. * * Each MPPTunnel has a Sender to consume data. There're three kinds of senders: sync_remote, local and async_remote. - * + * * The protocol between MPPTunnel and Sender: * - All data will be pushed into the `send_queue`, including errors. * - MPPTunnel may finish `send_queue` to notify Sender normally finish. @@ -259,6 +261,8 @@ class MPPTunnel : private boost::noncopyable const LoggerPtr & getLogger() const { return log; } + void updateMemTracker(); + TunnelSenderPtr getTunnelSender() { return tunnel_sender; } SyncTunnelSenderPtr getSyncTunnelSender() { return sync_tunnel_sender; } AsyncTunnelSenderPtr getAsyncTunnelSender() { return async_tunnel_sender; } @@ -292,11 +296,12 @@ class MPPTunnel : private boost::noncopyable // tunnel id is in the format like "tunnel[sender]+[receiver]" String tunnel_id; - using MPPDataPacketPtr = std::shared_ptr; - using DataPacketMPMCQueuePtr = std::shared_ptr>; + using TrackedMppDataPacketPtr = std::shared_ptr; + using DataPacketMPMCQueuePtr = std::shared_ptr>; DataPacketMPMCQueuePtr send_queue; ConnectionProfileInfo connection_profile_info; const LoggerPtr log; + MemoryTracker * mem_tracker; TunnelSenderMode mode; // Tunnel transfer data mode TunnelSenderPtr tunnel_sender; // Used to refer to one of sync/async/local_tunnel_sender which is not nullptr, just for coding convenience // According to mode value, among the sync/async/local_tunnel_senders, only the responding sender is not null and do actual work diff --git a/dbms/src/Flash/Mpp/MPPTunnelSet.cpp b/dbms/src/Flash/Mpp/MPPTunnelSet.cpp index 3de5af31091..e6610d4d7b8 100644 --- a/dbms/src/Flash/Mpp/MPPTunnelSet.cpp +++ b/dbms/src/Flash/Mpp/MPPTunnelSet.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -26,13 +27,6 @@ extern const char exception_during_mpp_write_err_to_tunnel[]; } // namespace FailPoints namespace { -inline mpp::MPPDataPacket serializeToPacket(const tipb::SelectResponse & response) -{ - mpp::MPPDataPacket packet; - if (!response.SerializeToString(packet.mutable_data())) - throw Exception(fmt::format("Fail to serialize response, response size: {}", response.ByteSizeLong())); - return packet; -} void checkPacketSize(size_t size) { @@ -57,11 +51,19 @@ void MPPTunnelSetBase::clearExecutionSummaries(tipb::SelectResponse & re } } +template +void MPPTunnelSetBase::updateMemTracker() +{ + for (size_t i = 0; i < tunnels.size(); ++i) + tunnels[i]->updateMemTracker(); +} + template void MPPTunnelSetBase::write(tipb::SelectResponse & response) { - auto packet = serializeToPacket(response); - tunnels[0]->write(packet); + TrackedMppDataPacket tracked_packet; + tracked_packet.serializeByResponse(response); + tunnels[0]->write(tracked_packet.getPacket()); if (tunnels.size() > 1) { @@ -69,10 +71,11 @@ void MPPTunnelSetBase::write(tipb::SelectResponse & response) if (response.execution_summaries_size() > 0) { clearExecutionSummaries(response); - packet = serializeToPacket(response); + tracked_packet = TrackedMppDataPacket(); + tracked_packet.serializeByResponse(response); } for (size_t i = 1; i < tunnels.size(); ++i) - tunnels[i]->write(packet); + tunnels[i]->write(tracked_packet.getPacket()); } } @@ -98,10 +101,11 @@ void MPPTunnelSetBase::write(mpp::MPPDataPacket & packet) template void MPPTunnelSetBase::write(tipb::SelectResponse & response, int16_t partition_id) { + TrackedMppDataPacket tracked_packet; 
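+    // serializeByResponse() charges the serialized bytes to the current query's
+    // MemoryTracker before the packet is handed to the tunnel below.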
if (partition_id != 0 && response.execution_summaries_size() > 0) clearExecutionSummaries(response); - - tunnels[partition_id]->write(serializeToPacket(response)); + tracked_packet.serializeByResponse(response); + tunnels[partition_id]->write(tracked_packet.getPacket()); } template diff --git a/dbms/src/Flash/Mpp/MPPTunnelSet.h b/dbms/src/Flash/Mpp/MPPTunnelSet.h index e4123db1be5..da37423876e 100644 --- a/dbms/src/Flash/Mpp/MPPTunnelSet.h +++ b/dbms/src/Flash/Mpp/MPPTunnelSet.h @@ -58,6 +58,7 @@ class MPPTunnelSetBase : private boost::noncopyable void close(const String & reason); void finishWrite(); void registerTunnel(const MPPTaskId & id, const TunnelPtr & tunnel); + void updateMemTracker(); TunnelPtr getTunnelByReceiverTaskId(const MPPTaskId & id); diff --git a/dbms/src/Flash/Mpp/TrackedMppDataPacket.h b/dbms/src/Flash/Mpp/TrackedMppDataPacket.h new file mode 100644 index 00000000000..7cb2103d4f3 --- /dev/null +++ b/dbms/src/Flash/Mpp/TrackedMppDataPacket.h @@ -0,0 +1,222 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wnon-virtual-dtor" +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif +#include +#include +#include +#include +#include +#pragma GCC diagnostic pop +#include + +#include + +namespace DB +{ +inline size_t estimateAllocatedSize(const mpp::MPPDataPacket & data) +{ + size_t ret = data.data().size(); + for (int i = 0; i < data.chunks_size(); i++) + { + ret += data.chunks(i).size(); + } + return ret; +} + +struct MemTrackerWrapper +{ + MemTrackerWrapper(size_t _size, MemoryTracker * memory_tracker) + : memory_tracker(memory_tracker) + , size(0) + { + alloc(_size); + } + + explicit MemTrackerWrapper(MemoryTracker * memory_tracker) + : memory_tracker(memory_tracker) + , size(0) + {} + + void alloc(size_t delta) + { + if (delta) + { + if (memory_tracker) + { + memory_tracker->alloc(delta); + size += delta; + } + } + } + + void free(size_t delta) + { + if (delta) + { + if (memory_tracker) + { + memory_tracker->free(delta); + size -= delta; + } + } + } + + void switchMemTracker(MemoryTracker * new_memory_tracker) + { + int bak_size = size; + freeAll(); + memory_tracker = new_memory_tracker; + alloc(bak_size); + } + ~MemTrackerWrapper() + { + freeAll(); + } + + void freeAll() + { + free(size); + } + MemoryTracker * memory_tracker; + size_t size = 0; +}; + +struct TrackedMppDataPacket +{ + explicit TrackedMppDataPacket(const mpp::MPPDataPacket & data, MemoryTracker * memory_tracker) + : mem_tracker_wrapper(estimateAllocatedSize(data), memory_tracker) + { + packet = data; + } + + explicit TrackedMppDataPacket() + : mem_tracker_wrapper(current_memory_tracker) + {} + + explicit TrackedMppDataPacket(MemoryTracker * memory_tracker) + : mem_tracker_wrapper(memory_tracker) + {} + + void addChunk(std::string && value) + { + 
mem_tracker_wrapper.alloc(value.size()); + packet.add_chunks(std::move(value)); + } + + void serializeByResponse(const tipb::SelectResponse & response) + { + mem_tracker_wrapper.alloc(response.ByteSizeLong()); + if (!response.SerializeToString(packet.mutable_data())) + { + mem_tracker_wrapper.free(response.ByteSizeLong()); + throw Exception(fmt::format("Fail to serialize response, response size: {}", response.ByteSizeLong())); + } + } + + void read(const std::unique_ptr<::grpc::ClientAsyncReader<::mpp::MPPDataPacket>> & reader, void * callback) + { + reader->Read(&packet, callback); + need_recompute = true; + //we shouldn't update tracker now, since it's an async reader!! + } + + // we need recompute in some cases we can't update memory counter timely, such as async read + void recomputeTrackedMem() + { + if (need_recompute) + { + mem_tracker_wrapper.freeAll(); + mem_tracker_wrapper.alloc(estimateAllocatedSize(packet)); + need_recompute = false; + } + } + + bool read(const std::unique_ptr<::grpc::ClientReader<::mpp::MPPDataPacket>> & reader) + { + bool ret = reader->Read(&packet); + mem_tracker_wrapper.freeAll(); + mem_tracker_wrapper.alloc(estimateAllocatedSize(packet)); + return ret; + } + + void switchMemTracker(MemoryTracker * new_memory_tracker) + { + mem_tracker_wrapper.switchMemTracker(new_memory_tracker); + } + + bool hasError() const + { + return packet.has_error(); + } + + const ::mpp::Error & error() const + { + return packet.error(); + } + + mpp::MPPDataPacket & getPacket() + { + return packet; + } + + MemTrackerWrapper mem_tracker_wrapper; + mpp::MPPDataPacket packet; + bool need_recompute = false; +}; + +struct TrackedSelectResp +{ + explicit TrackedSelectResp() + : memory_tracker(current_memory_tracker) + {} + + void addChunk(std::string && value) + { + memory_tracker.alloc(value.size()); + auto * dag_chunk = response.add_chunks(); + dag_chunk->set_rows_data(std::move(value)); + } + + tipb::SelectResponse & getResponse() + { + return response; + } + + void setEncodeType(::tipb::EncodeType value) + { + response.set_encode_type(value); + } + + tipb::ExecutorExecutionSummary * addExecutionSummary() + { + return response.add_execution_summaries(); + } + + MemTrackerWrapper memory_tracker; + tipb::SelectResponse response; +}; + +} // namespace DB diff --git a/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp b/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp index ae720badb68..59409e5dd0b 100644 --- a/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp +++ b/dbms/src/Flash/Mpp/tests/gtest_mpptunnel.cpp @@ -77,11 +77,11 @@ struct MockLocalReader { while (true) { - MPPDataPacketPtr tmp_packet = local_sender->readForLocal(); + TrackedMppDataPacketPtr tmp_packet = local_sender->readForLocal(); bool success = tmp_packet != nullptr; if (success) { - write_packet_vec.push_back(tmp_packet->data()); + write_packet_vec.push_back(tmp_packet->packet.data()); } else { @@ -118,7 +118,7 @@ struct MockTerminateLocalReader void read() const { - MPPDataPacketPtr tmp_packet = local_sender->readForLocal(); + TrackedMppDataPacketPtr tmp_packet = local_sender->readForLocal(); local_sender->consumerFinish("Receiver closed"); } }; diff --git a/dbms/src/Flash/tests/bench_exchange.h b/dbms/src/Flash/tests/bench_exchange.h index d8300d45740..cb4d62aeebd 100644 --- a/dbms/src/Flash/tests/bench_exchange.h +++ b/dbms/src/Flash/tests/bench_exchange.h @@ -96,7 +96,7 @@ struct MockReceiverContext { // Not implement benchmark for Async GRPC for now. 
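    // The stubs below only assert(0); switching their signatures to TrackedMppDataPacketPtr
    // just keeps this mock compiling against the new tracked-packet reader API.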
void init(UnaryCallback *) { assert(0); } - void read(MPPDataPacketPtr &, UnaryCallback *) { assert(0); } + void read(TrackedMppDataPacketPtr &, UnaryCallback *) { assert(0); } void finish(::grpc::Status &, UnaryCallback *) { assert(0); } }; diff --git a/dbms/src/Interpreters/ProcessList.cpp b/dbms/src/Interpreters/ProcessList.cpp index 0f667cfd396..5e50d560bb0 100644 --- a/dbms/src/Interpreters/ProcessList.cpp +++ b/dbms/src/Interpreters/ProcessList.cpp @@ -162,6 +162,7 @@ ProcessList::EntryPtr ProcessList::insert( /// not for specific users, sessions or queries, /// because this setting is effectively global. total_memory_tracker.setOrRaiseLimit(settings.max_memory_usage_for_all_queries); + total_memory_tracker.setBytesThatRssLargerThanLimit(settings.bytes_that_rss_larger_than_limit); total_memory_tracker.setDescription("(total)"); user_process_list.user_memory_tracker.setNext(&total_memory_tracker); } diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index 6442f9c8dd6..52fec1c4c1a 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -341,6 +341,7 @@ struct Settings M(SettingUInt64, max_memory_usage, 0, "Maximum memory usage for processing of single query. Zero means unlimited.") \ M(SettingUInt64, max_memory_usage_for_user, 0, "Maximum memory usage for processing all concurrently running queries for the user. Zero means unlimited.") \ M(SettingUInt64, max_memory_usage_for_all_queries, 0, "Maximum memory usage for processing all concurrently running queries on the server. Zero means unlimited.") \ + M(SettingUInt64, bytes_that_rss_larger_than_limit, 5368709120, "How many bytes RSS(Resident Set Size) can be larger than limit(max_memory_usage_for_all_queries). Default: 5GB ") \ \ M(SettingUInt64, max_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for a query. Zero means unlimited.") \ M(SettingUInt64, max_network_bytes, 0, "The maximum number of bytes (compressed) to receive or transmit over the network for execution of the query.") \ diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp index 77a3a76d842..fc2103d7838 100644 --- a/dbms/src/Interpreters/executeQuery.cpp +++ b/dbms/src/Interpreters/executeQuery.cpp @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -231,6 +233,16 @@ std::tuple executeQueryImpl( context.setProcessListElement(&process_list_entry->get()); } + // Do set-up work for tunnels and receivers after ProcessListEntry is constructed, + // so that we can propagate current_memory_tracker into them. + if (context.getDAGContext()) // When using TiFlash client, dag context will be nullptr in this case. 
+ { + if (context.getDAGContext()->tunnel_set) + context.getDAGContext()->tunnel_set->updateMemTracker(); + if (context.getDAGContext()->getMppReceiverSet()) + context.getDAGContext()->getMppReceiverSet()->setUpConnection(); + } + FAIL_POINT_TRIGGER_EXCEPTION(FailPoints::random_interpreter_failpoint); auto interpreter = query_src.interpreter(context, stage); res = interpreter->execute(); diff --git a/dbms/src/Server/FlashGrpcServerHolder.cpp b/dbms/src/Server/FlashGrpcServerHolder.cpp index 1190985004d..c359db24298 100644 --- a/dbms/src/Server/FlashGrpcServerHolder.cpp +++ b/dbms/src/Server/FlashGrpcServerHolder.cpp @@ -84,6 +84,7 @@ FlashGrpcServerHolder::FlashGrpcServerHolder(Context & context, Poco::Util::Laye : log(log_) , is_shutdown(std::make_shared>(false)) { + background_task.begin(); grpc::ServerBuilder builder; if (security_config.has_tls_config) { @@ -190,6 +191,7 @@ FlashGrpcServerHolder::~FlashGrpcServerHolder() LOG_FMT_INFO(log, "Begin to shut down flash service"); flash_service.reset(); LOG_FMT_INFO(log, "Shut down flash service"); + background_task.end(); } catch (...) { diff --git a/dbms/src/Server/FlashGrpcServerHolder.h b/dbms/src/Server/FlashGrpcServerHolder.h index 57146f40aae..054138de246 100644 --- a/dbms/src/Server/FlashGrpcServerHolder.h +++ b/dbms/src/Server/FlashGrpcServerHolder.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include #include @@ -49,6 +50,7 @@ class FlashGrpcServerHolder std::vector> cqs; std::vector> notify_cqs; std::shared_ptr thread_manager; + CollectProcInfoBackgroundTask background_task; }; } // namespace DB \ No newline at end of file
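Taken together, the patch wires three pieces: the background sampler keeps `real_rss` fresh, the Tracked* wrappers make MPP packet memory visible to the query's MemoryTracker, and MemoryTracker::alloc() on the root tracker refuses allocations either when the tracked amount would exceed the limit or when the process RSS is already more than `bytes_that_rss_larger_than_limit` above it. The RSS condition in alloc() is fairly dense, so here is a plainer restatement of the same check; this is an illustration with stand-in types, not the exact code from MemoryTracker.cpp:

```cpp
#include <cstdint>

using Int64 = std::int64_t; // stand-in for TiFlash's Int64

// Restates the limit check added to MemoryTracker::alloc() for the root tracker
// (the one with next == nullptr). `will_be` is the tracked amount after this
// allocation; `rss_slack` plays the role of bytes_rss_larger_than_limit.
bool wouldRejectAllocation(Int64 will_be, Int64 limit, Int64 real_rss, Int64 rss_slack)
{
    if (limit == 0)
        return false; // zero means unlimited

    if (will_be > limit)
        return true; // "out of memory quota for data computing"

    Int64 rss_overshoot = real_rss - limit - rss_slack;
    if (rss_overshoot > 0 && will_be > limit - rss_overshoot)
        return true; // "RSS much larger than limit": the overshoot shrinks the usable quota

    return false;
}
```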