diff --git a/source/source_pw/module_pwdft/kernels/cuda/mul_potential_op.cu b/source/source_pw/module_pwdft/kernels/cuda/mul_potential_op.cu
index ad148deb23..1724a5e2eb 100644
--- a/source/source_pw/module_pwdft/kernels/cuda/mul_potential_op.cu
+++ b/source/source_pw/module_pwdft/kernels/cuda/mul_potential_op.cu
@@ -37,7 +37,7 @@ struct mul_potential_op<std::complex<FPTYPE>, base_device::DEVICE_GPU>
         int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
 
         mul_potential_kernel<<<num_blocks, threads_per_block>>>(
-            pot + ik * nks * npw + iq * npw,
+            pot,
             reinterpret_cast<thrust::complex<FPTYPE>*>(density_recip),
             npw);
 
diff --git a/source/source_pw/module_pwdft/kernels/mul_potential_op.cpp b/source/source_pw/module_pwdft/kernels/mul_potential_op.cpp
index 85647e065d..53cf7a6ff3 100644
--- a/source/source_pw/module_pwdft/kernels/mul_potential_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/mul_potential_op.cpp
@@ -14,8 +14,8 @@ struct mul_potential_op<std::complex<FPTYPE>, base_device::DEVICE_CPU>
         #endif
         for (int ig = 0; ig < npw; ig++)
         {
-            int ig_kq = ik * nks * npw + iq * npw + ig;
-            density_recip[ig] *= pot[ig_kq];
+            // int ig_kq = ik * nks * npw + iq * npw + ig;
+            density_recip[ig] *= pot[ig];
 
         }
     }
diff --git a/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp b/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp
index 6019ac110c..89918e16c8 100644
--- a/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp
+++ b/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp
@@ -11,7 +11,10 @@ void get_exx_potential(const K_Vectors* kv,
                        Real* pot,
                        double tpiba,
                        bool gamma_extrapolation,
-                       double ucell_omega)
+                       double ucell_omega,
+                       int ik,
+                       int iq,
+                       bool is_stress)
 {
     using setmem_real_cpu_op = base_device::memory::set_memory_op<Real, base_device::DEVICE_CPU>;
     using syncmem_real_c2d_op = base_device::memory::synchronize_memory_op<Real, base_device::DEVICE_CPU, Device>;
@@ -23,180 +26,168 @@ void get_exx_potential(const K_Vectors* kv,
     Real* pot_cpu = nullptr;
     int nks = wfcpw->nks, npw = rhopw_dev->npw;
     double tpiba2 = tpiba * tpiba;
-    pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks];
+    pot_cpu = new Real[npw];
     // fill zero
-    setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks);
+    setmem_real_cpu_op()(pot_cpu, 0, npw);
 
     // calculate Fock pot
     auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock];
-    for (auto param: param_fock)
+    for (int i = 0; i < param_fock.size(); i++)
     {
-        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock,
-                                        0.0,
-                                        kv,
-                                        wfcpw,
-                                        rhopw_dev,
-                                        tpiba,
-                                        gamma_extrapolation,
-                                        ucell_omega);
+        auto param = param_fock[i];
+        double exx_div = OperatorEXXPW<std::complex<Real>, Device>::fock_div[i];
         double alpha = std::stod(param["alpha"]);
-        for (int ik = 0; ik < nks; ik++)
-        {
-            for (int iq = 0; iq < nks; iq++)
-            {
-                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
-                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
+        const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+        const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+        const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
+        const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
 
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-                for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        {
+            const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
+            const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
+            // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+            // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+            // grid_factor is designed for the 7/8 of the grid to function like all of the points
+            Real grid_factor = 1;
+            double extrapolate_grid = 8.0 / 7.0;
+            if (gamma_extrapolation)
+            {
+                // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
+                auto isint = [](double x) {
+                    double epsilon = 1e-6; // this follows the isint judgement in q-e
+                    return std::abs(x - std::round(x)) < epsilon;
+                };
+                if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
                 {
-                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
-                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
-                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
-                    Real grid_factor = 1;
-                    double extrapolate_grid = 8.0 / 7.0;
-                    if (gamma_extrapolation)
-                    {
-                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
-                        auto isint = [](double x) {
-                            double epsilon = 1e-6; // this follows the isint judgement in q-e
-                            return std::abs(x - std::round(x)) < epsilon;
-                        };
-                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
-                        {
-                            grid_factor = 0;
-                        }
-                        else
-                        {
-                            grid_factor = extrapolate_grid;
-                        }
-                    }
-
-                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
-                    const int nk = nks / nk_fac;
-                    const int ig_kq = ik * nks * npw + iq * npw + ig;
-
-                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
-                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
-                    if (gg >= 1e-8)
-                    {
-                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
-                        pot_cpu[ig_kq] += fac * grid_factor * alpha;
-                    }
-                    // }
-                    else
-                    {
-                        pot_cpu[ig_kq] += exx_div * alpha;
-                    }
-                    // assert(is_finite(density_recip[ig]));
+                    grid_factor = 0;
                 }
+                else
+                {
+                    grid_factor = extrapolate_grid;
+                }
+            }
+
+            const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+            const int nk = nks / nk_fac;
+
+            Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
+            // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
+            if (gg >= 1e-8)
+            {
+                Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
+                pot_cpu[ig] += fac * grid_factor * alpha;
             }
+            // }
+            else
+            {
+                pot_cpu[ig] += exx_div * alpha;
+            }
+            // assert(is_finite(density_recip[ig]));
         }
     }
 
     // calculate erfc pot
     auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc];
-    for (auto param: param_erfc)
+    for (int i = 0; i < param_erfc.size(); i++)
     {
+        auto param = param_erfc[i];
         double erfc_omega = std::stod(param["omega"]);
         double erfc_omega2 = erfc_omega * erfc_omega;
         double alpha = std::stod(param["alpha"]);
+        // double exx_div = OperatorEXXPW<std::complex<Real>, Device>::erfc_div[i];
         double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc,
-                                        erfc_omega,
-                                        kv,
-                                        wfcpw,
-                                        rhopw_dev,
-                                        tpiba,
-                                        gamma_extrapolation,
-                                        ucell_omega);
-        for (int ik = 0; ik < nks; ik++)
-        {
-            for (int iq = 0; iq < nks; iq++)
-            {
-                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
-                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
+                                          erfc_omega,
+                                          kv,
+                                          wfcpw,
+                                          rhopw_dev,
+                                          tpiba,
+                                          gamma_extrapolation,
+                                          ucell_omega);
+        const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+        const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+        const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
+        const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
 
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-                for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        {
+            const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
+            const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
+            // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+            // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+            // grid_factor is designed for the 7/8 of the grid to function like all of the points
+            Real grid_factor = 1;
+            double extrapolate_grid = 8.0 / 7.0;
+            if (gamma_extrapolation)
+            {
+                // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
+                auto isint = [](double x) {
+                    double epsilon = 1e-6; // this follows the isint judgement in q-e
+                    return std::abs(x - std::round(x)) < epsilon;
+                };
+                if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
+                {
+                    grid_factor = 0;
+                }
+                else
+                {
+                    grid_factor = extrapolate_grid;
+                }
+            }
+
+            const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+            const int nk = nks / nk_fac;
+            // const int ig_kq = ik * nks * npw + iq * npw + ig;
+
+            Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
+            // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
+            if (gg >= 1e-8)
+            {
+                Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
+                pot_cpu[ig] += fac * (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) * grid_factor * alpha;
+            }
+            // }
+            else
+            {
+                // if (PARAM.inp.dft_functional == "hse")
+                if (!gamma_extrapolation)
                 {
-                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
-                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
-                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
-                    Real grid_factor = 1;
-                    double extrapolate_grid = 8.0 / 7.0;
-                    if (gamma_extrapolation)
-                    {
-                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
-                        auto isint = [](double x) {
-                            double epsilon = 1e-6; // this follows the isint judgement in q-e
-                            return std::abs(x - std::round(x)) < epsilon;
-                        };
-                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
-                        {
-                            grid_factor = 0;
-                        }
-                        else
-                        {
-                            grid_factor = extrapolate_grid;
-                        }
-                    }
-
-                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
-                    const int nk = nks / nk_fac;
-                    const int ig_kq = ik * nks * npw + iq * npw + ig;
-
-                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
-                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
-                    if (gg >= 1e-8)
-                    {
-                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
-                        pot_cpu[ig_kq] += fac * (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) * grid_factor * alpha;
-                    }
-                    // }
+                    if (is_stress)
+                        pot_cpu[ig] += (- ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha;
                     else
-                    {
-                        // if (PARAM.inp.dft_functional == "hse")
-                        if (!gamma_extrapolation)
-                        {
-                            pot_cpu[ig_kq] += (exx_div - ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha;
-                        }
-                        else
-                        {
-                            pot_cpu[ig_kq] += exx_div * alpha;
-                        }
-                    }
-                    // assert(is_finite(density_recip[ig]));
+                        pot_cpu[ig] += (exx_div - ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha;
+                }
+                else
+                {
+                    pot_cpu[ig] += exx_div * alpha;
                 }
             }
+            // assert(is_finite(density_recip[ig]));
         }
     }
 
     // copy the potential to the device memory
-    syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks);
+    syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw);
 
     delete pot_cpu;
 }
 
 template <typename Real, typename Device>
 void get_exx_stress_potential(const K_Vectors* kv,
-                       const ModulePW::PW_Basis_K* wfcpw,
-                       ModulePW::PW_Basis* rhopw_dev,
-                       Real* pot,
-                       double tpiba,
-                       bool gamma_extrapolation,
-                       double ucell_omega)
+                              const ModulePW::PW_Basis_K* wfcpw,
+                              ModulePW::PW_Basis* rhopw_dev,
+                              Real* pot,
+                              double tpiba,
+                              bool gamma_extrapolation,
+                              double ucell_omega,
+                              int ik,
+                              int iq)
 {
     using setmem_real_cpu_op = base_device::memory::set_memory_op<Real, base_device::DEVICE_CPU>;
     using syncmem_real_c2d_op = base_device::memory::synchronize_memory_op<Real, base_device::DEVICE_CPU, Device>;
@@ -208,74 +199,69 @@ void get_exx_stress_potential(const K_Vectors* kv,
     Real* pot_cpu = nullptr;
     int nks = wfcpw->nks, npw = rhopw_dev->npw;
     double tpiba2 = tpiba * tpiba;
-    pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks];
+    pot_cpu = new Real[npw];
     // fill zero
-    setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks);
+    setmem_real_cpu_op()(pot_cpu, 0, npw);
 
     // calculate Fock pot
     auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock];
     for (auto param: param_fock)
     {
-        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock,
-                                        0.0,
-                                        kv,
-                                        wfcpw,
-                                        rhopw_dev,
-                                        tpiba,
-                                        gamma_extrapolation,
-                                        ucell_omega);
+        // double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock,
+        //                                 0.0,
+        //                                 kv,
+        //                                 wfcpw,
+        //                                 rhopw_dev,
+        //                                 tpiba,
+        //                                 gamma_extrapolation,
+        //                                 ucell_omega);
         double alpha = std::stod(param["alpha"]);
-        for (int ik = 0; ik < nks; ik++)
-        {
-            for (int iq = 0; iq < nks; iq++)
-            {
-                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
-                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
+
+        const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+        const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+        const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
+        const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
 
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-                for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        {
+            const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
+            const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
+            // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+            // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+            // grid_factor is designed for the 7/8 of the grid to function like all of the points
+            Real grid_factor = 1;
+            double extrapolate_grid = 8.0 / 7.0;
+            if (gamma_extrapolation)
+            {
+                // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
+                auto isint = [](double x) {
+                    double epsilon = 1e-6; // this follows the isint judgement in q-e
+                    return std::abs(x - std::round(x)) < epsilon;
+                };
+                if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
                 {
-                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
-                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
-                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
-                    Real grid_factor = 1;
-                    double extrapolate_grid = 8.0 / 7.0;
-                    if (gamma_extrapolation)
-                    {
-                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
-                        auto isint = [](double x) {
-                            double epsilon = 1e-6; // this follows the isint judgement in q-e
-                            return std::abs(x - std::round(x)) < epsilon;
-                        };
-                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
-                        {
-                            grid_factor = 0;
-                        }
-                        else
-                        {
-                            grid_factor = extrapolate_grid;
-                        }
-                    }
-
-                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
-                    const int nk = nks / nk_fac;
-                    const int ig_kq = ik * nks * npw + iq * npw + ig;
-
-                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
-                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
-                    if (gg >= 1e-8)
-                    {
-                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
-                        pot_cpu[ig_kq] += 1.0 / gg * grid_factor * alpha;
-                    }
+                    grid_factor = 0;
+                }
+                else
+                {
+                    grid_factor = extrapolate_grid;
                 }
             }
+
+            const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+            const int nk = nks / nk_fac;
+            // const int ig_kq = ik * nks * npw + iq * npw + ig;
+
+            Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
+            // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
+            if (gg >= 1e-8)
+            {
+                Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
+                pot_cpu[ig] += 1.0 / gg * grid_factor * alpha;
+            }
         }
     }
 
@@ -286,80 +272,76 @@ void get_exx_stress_potential(const K_Vectors* kv,
         double erfc_omega = std::stod(param["omega"]);
         double erfc_omega2 = erfc_omega * erfc_omega;
         double alpha = std::stod(param["alpha"]);
-        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc,
-                                        erfc_omega,
-                                        kv,
-                                        wfcpw,
-                                        rhopw_dev,
-                                        tpiba,
-                                        gamma_extrapolation,
-                                        ucell_omega);
-        for (int ik = 0; ik < nks; ik++)
-        {
-            for (int iq = 0; iq < nks; iq++)
-            {
-                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
-                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
+        // double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc,
+        //                                 erfc_omega,
+        //                                 kv,
+        //                                 wfcpw,
+        //                                 rhopw_dev,
+        //                                 tpiba,
+        //                                 gamma_extrapolation,
+        //                                 ucell_omega);
+
+        const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+        const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+        const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
+        const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
 
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-                for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        {
+            const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
+            const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
+            // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+            // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+            // grid_factor is designed for the 7/8 of the grid to function like all of the points
+            Real grid_factor = 1;
+            double extrapolate_grid = 8.0 / 7.0;
+            if (gamma_extrapolation)
+            {
+                // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
+                auto isint = [](double x) {
+                    double epsilon = 1e-6; // this follows the isint judgement in q-e
+                    return std::abs(x - std::round(x)) < epsilon;
+                };
+                if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
                 {
-                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
-                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
-                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
-                    Real grid_factor = 1;
-                    double extrapolate_grid = 8.0 / 7.0;
-                    if (gamma_extrapolation)
-                    {
-                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
-                        auto isint = [](double x) {
-                            double epsilon = 1e-6; // this follows the isint judgement in q-e
-                            return std::abs(x - std::round(x)) < epsilon;
-                        };
-                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
-                        {
-                            grid_factor = 0;
-                        }
-                        else
-                        {
-                            grid_factor = extrapolate_grid;
-                        }
-                    }
-
-                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
-                    const int nk = nks / nk_fac;
-                    const int ig_kq = ik * nks * npw + iq * npw + ig;
-
-                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
-                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
-                    if (gg >= 1e-8)
-                    {
-                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
-                        pot_cpu[ig_kq] += (1.0 - (1.0 + gg / 4.0 / erfc_omega2) * std::exp(-gg / 4.0 / erfc_omega2)) / (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) / gg * grid_factor * alpha;
-                    }
-                    // }
-                    else
-                    {
-                        // if (PARAM.inp.dft_functional == "hse")
-                        if (!gamma_extrapolation)
-                        {
-                            pot_cpu[ig_kq] += 1.0 / 4.0 / erfc_omega2 * alpha;
-                        }
-                    }
-                    // assert(is_finite(density_recip[ig]));
+                    grid_factor = 0;
+                }
+                else
+                {
+                    grid_factor = extrapolate_grid;
                 }
             }
+
+            const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+            const int nk = nks / nk_fac;
+            // const int ig_kq = ik * nks * npw + iq * npw + ig;
+
+            Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
+            // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
+            if (gg >= 1e-8)
+            {
+                Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
+                pot_cpu[ig] += (1.0 - (1.0 + gg / 4.0 / erfc_omega2) * std::exp(-gg / 4.0 / erfc_omega2))
+                               / (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) / gg * grid_factor * alpha;
+            }
+            // }
+            else
+            {
+                // if (PARAM.inp.dft_functional == "hse")
+                if (!gamma_extrapolation)
+                {
+                    pot_cpu[ig] += 1.0 / 4.0 / erfc_omega2 * alpha;
+                }
+            }
+            // assert(is_finite(density_recip[ig]));
         }
     }
 
     // copy the potential to the device memory
-    syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks);
+    syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw);
 
     delete pot_cpu;
 }
@@ -495,28 +477,38 @@ template void get_exx_potential<float, base_device::DEVICE_CPU>(const K_Vectors*
                                                                 float*,
                                                                 double,
                                                                 bool,
-                                                                double);
+                                                                double,
+                                                                int,
+                                                                int,
+                                                                        bool);
 template void get_exx_potential<double, base_device::DEVICE_CPU>(const K_Vectors*,
                                                                  const ModulePW::PW_Basis_K*,
                                                                  ModulePW::PW_Basis*,
                                                                  double*,
                                                                  double,
                                                                  bool,
-                                                                 double);
+                                                                 double,
+                                                                 int,
+                                                                 int,
+                                                                        bool);
 template void get_exx_stress_potential<float, base_device::DEVICE_CPU>(const K_Vectors*,
-                                                                const ModulePW::PW_Basis_K*,
-                                                                ModulePW::PW_Basis*,
-                                                                float*,
-                                                                double,
-                                                                bool,
-                                                                double);
+                                                                       const ModulePW::PW_Basis_K*,
+                                                                       ModulePW::PW_Basis*,
+                                                                       float*,
+                                                                       double,
+                                                                       bool,
+                                                                       double,
+                                                                       int,
+                                                                       int);
 template void get_exx_stress_potential<double, base_device::DEVICE_CPU>(const K_Vectors*,
-                                                                 const ModulePW::PW_Basis_K*,
-                                                                 ModulePW::PW_Basis*,
-                                                                 double*,
-                                                                 double,
-                                                                 bool,
-                                                                 double);
+                                                                        const ModulePW::PW_Basis_K*,
+                                                                        ModulePW::PW_Basis*,
+                                                                        double*,
+                                                                        double,
+                                                                        bool,
+                                                                        double,
+                                                                        int,
+                                                                        int);
 #if ((defined __CUDA) || (defined __ROCM))
 template class OperatorEXXPW<std::complex<float>, base_device::DEVICE_GPU>;
 template class OperatorEXXPW<std::complex<double>, base_device::DEVICE_GPU>;
@@ -526,27 +518,37 @@ template void get_exx_potential<float, base_device::DEVICE_GPU>(const K_Vectors*
                                                                 float*,
                                                                 double,
                                                                 bool,
-                                                                double);
+                                                                double,
+                                                                int,
+                                                                int,
+                                                                bool);
 template void get_exx_potential<double, base_device::DEVICE_GPU>(const K_Vectors*,
                                                                  const ModulePW::PW_Basis_K*,
                                                                  ModulePW::PW_Basis*,
                                                                  double*,
                                                                  double,
                                                                  bool,
-                                                                 double);
+                                                                 double,
+                                                                 int,
+                                                                 int,
+                                                                 bool);
 template void get_exx_stress_potential<float, base_device::DEVICE_GPU>(const K_Vectors*,
-                                                                const ModulePW::PW_Basis_K*,
-                                                                ModulePW::PW_Basis*,
-                                                                float*,
-                                                                double,
-                                                                bool,
-                                                                double);
+                                                                       const ModulePW::PW_Basis_K*,
+                                                                       ModulePW::PW_Basis*,
+                                                                       float*,
+                                                                       double,
+                                                                       bool,
+                                                                       double,
+                                                                       int,
+                                                                       int);
 template void get_exx_stress_potential<double, base_device::DEVICE_GPU>(const K_Vectors*,
-                                                                 const ModulePW::PW_Basis_K*,
-                                                                 ModulePW::PW_Basis*,
-                                                                 double*,
-                                                                 double,
-                                                                 bool,
-                                                                 double);
+                                                                        const ModulePW::PW_Basis_K*,
+                                                                        ModulePW::PW_Basis*,
+                                                                        double*,
+                                                                        double,
+                                                                        bool,
+                                                                        double,
+                                                                        int,
+                                                                        int);
 #endif
 } // namespace hamilt
\ No newline at end of file
diff --git a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp
index 83879d634f..404c660877 100644
--- a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp
+++ b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp
@@ -23,6 +23,11 @@
 
 namespace hamilt
 {
+template <typename T, typename Device>
+std::vector<typename GetTypeReal<T>::type> OperatorEXXPW<T, Device>::fock_div = {};
+
+template <typename T, typename Device>
+std::vector<typename GetTypeReal<T>::type> OperatorEXXPW<T, Device>::erfc_div = {};
 
 template <typename T, typename Device>
 OperatorEXXPW<T, Device>::OperatorEXXPW(const int* isk_in,
@@ -66,7 +71,7 @@ OperatorEXXPW<T, Device>::OperatorEXXPW(const int* isk_in,
 
     int nks = wfcpw->nks;
     int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
-    resmem_real_op()(pot, rhopw->npw * nks * nks);
+    resmem_real_op()(pot, rhopw->npw);
 
     tpiba = ucell->tpiba;
     Real tpiba2 = tpiba * tpiba;
@@ -83,6 +88,31 @@ OperatorEXXPW<T, Device>::OperatorEXXPW(const int* isk_in,
     rhopw_dev->setuptransform();
     rhopw_dev->collect_local_pw();
 
+    auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock];
+    for (auto param: param_fock)
+    {
+        fock_div.push_back(exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock,
+                                          0.0,
+                                          kv,
+                                          wfcpw,
+                                          rhopw_dev,
+                                          tpiba,
+                                          gamma_extrapolation,
+                                          ucell->omega));
+    }
+    auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc];
+    for (auto param: param_erfc)
+    {
+        erfc_div.push_back(exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc,
+                                          std::stod(param["omega"]),
+                                          kv,
+                                          wfcpw,
+                                          rhopw_dev,
+                                          tpiba,
+                                          gamma_extrapolation,
+                                          ucell->omega));
+    }
+
 }   // end of constructor
 
 template <typename T, typename Device>
@@ -168,11 +198,7 @@ void OperatorEXXPW<T, Device>::act_op(const int nbands,
 //              << " ngk_ik: " << ngk_ik
 //              << " is_first_node: " << is_first_node
 //              << std::endl;
-    if (!potential_got)
-    {
-        get_exx_potential<Real, Device>(kv, wfcpw, rhopw_dev, pot, tpiba, gamma_extrapolation, ucell->omega);
-        potential_got = true;
-    }
+    // get_exx_potential<Real, Device>(kv, wfcpw, rhopw_dev, pot, tpiba, gamma_extrapolation, ucell->omega, ik, iq);
 
 //    set_psi(&p_exx_helper->psi);
 
@@ -199,6 +225,7 @@ void OperatorEXXPW<T, Device>::act_op(const int nbands,
         Real nqs = q_points.size();
         for (int iq: q_points)
         {
+            get_exx_potential<Real, Device>(kv, wfcpw, rhopw_dev, pot, tpiba, gamma_extrapolation, ucell->omega, this->ik, iq);
 //            std::cout << "ik" << this->ik << " iq" << iq << std::endl;
             for (int m_iband = 0; m_iband < psi.get_nbands(); m_iband++)
             {
@@ -421,6 +448,7 @@ double OperatorEXXPW<T, Device>::cal_exx_energy_op(psi::Psi<T, Device> *ppsi_) c
 
             for (int iq: q_points)
             {
+                get_exx_potential<Real, Device>(kv, wfcpw, rhopw_dev, pot, tpiba, gamma_extrapolation, ucell->omega, ik, iq);
                 for (int m_iband = 0; m_iband < psi.get_nbands(); m_iband++)
                 {
                     // double wg_f = GlobalC::exx_helper.wg(iq, m_iband);
@@ -441,7 +469,7 @@ double OperatorEXXPW<T, Device>::cal_exx_energy_op(psi::Psi<T, Device> *ppsi_) c
                     int nks = wfcpw->nks;
                     int npw = rhopw_dev->npw;
                     int nk = nks / nk_fac;
-                    Eexx_ik_real += exx_cal_energy_op<T, Device>()(density_recip, pot + ik * nks * npw + iq * npw, wg_iqb_real / nqs * wg_ikb_real / kv->wk[ik], npw);
+                    Eexx_ik_real += exx_cal_energy_op<T, Device>()(density_recip, pot, wg_iqb_real / nqs * wg_ikb_real / kv->wk[ik], npw);
 
                 } // m_iband
 
diff --git a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h
index 577db11b04..d823eda015 100644
--- a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h
+++ b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h
@@ -55,13 +55,14 @@ class OperatorEXXPW : public OperatorPW<T, Device>
 
     bool first_iter = true;
 
+    static std::vector<Real> fock_div, erfc_div;
+
   private:
     const int* isk = nullptr;
     const ModulePW::PW_Basis_K* wfcpw = nullptr;
     const ModulePW::PW_Basis* rhopw = nullptr;
     ModulePW::PW_Basis* rhopw_dev = nullptr; // for device
     const UnitCell *ucell = nullptr;
-//    Real exx_div = 0;
     Real tpiba = 0;
     
     std::vector<int> get_q_points(const int ik) const;
@@ -167,7 +168,10 @@ void get_exx_potential(const K_Vectors* kv,
                        Real* pot,
                        double tpiba,
                        bool gamma_extrapolation,
-                       double ucell_omega);
+                       double ucell_omega,
+                       int ik,
+                       int iq,
+                       bool is_stress = false);
 
 template <typename Real, typename Device>
 void get_exx_stress_potential(const K_Vectors* kv,
@@ -176,7 +180,9 @@ void get_exx_stress_potential(const K_Vectors* kv,
                               Real* pot,
                               double tpiba,
                               bool gamma_extrapolation,
-                              double ucell_omega);
+                              double ucell_omega,
+                              int ik,
+                              int iq);
 
 double exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type,
                       double erfc_omega,
diff --git a/source/source_pw/module_pwdft/stress_func_exx.cpp b/source/source_pw/module_pwdft/stress_func_exx.cpp
index 4aef3c872f..dd2c6570a0 100644
--- a/source/source_pw/module_pwdft/stress_func_exx.cpp
+++ b/source/source_pw/module_pwdft/stress_func_exx.cpp
@@ -1,5 +1,6 @@
 #include "global.h"
 #include "operator_pw/op_exx_pw.h"
+#include "source_base/parallel_common.h"
 #include "stress_pw.h"
 
 template <typename FPTYPE, typename Device>
@@ -10,19 +11,15 @@ void Stress_PW<FPTYPE, Device>::stress_exx(ModuleBase::matrix& sigma,
                                            const K_Vectors *p_kv,
                                            const psi::Psi <std::complex<FPTYPE>, Device>* d_psi_in, const UnitCell& ucell)
 {
-    double nqs_half1 = 0.5 * p_kv->nmp[0];
-    double nqs_half2 = 0.5 * p_kv->nmp[1];
-    double nqs_half3 = 0.5 * p_kv->nmp[2];
     bool gamma_extrapolation = PARAM.inp.exx_gamma_extrapolation;
-    if (!p_kv->get_is_mp())
+    bool is_mp = p_kv->get_is_mp();
+#ifdef __MPI
+    Parallel_Common::bcast_bool(is_mp);
+#endif
+    if (!is_mp)
     {
         gamma_extrapolation = false;
     }
-    auto isint = [](double x)
-    {
-        double epsilon = 1e-6; // this follows the isint judgement in q-e
-        return std::abs(x - std::round(x)) < epsilon;
-    };
 
     // T is complex of FPTYPE, if FPTYPE is double, T is std::complex<double>
     // but if FPTYPE is std::complex<double>, T is still std::complex<double>
@@ -54,11 +51,11 @@ void Stress_PW<FPTYPE, Device>::stress_exx(ModuleBase::matrix& sigma,
     resmem_complex_op()(psi_mq_real, wfcpw->nrxx);
     resmem_complex_op()(density_real, rhopw->nrxx);
     resmem_complex_op()(density_recip, rhopw->npw);
-    resmem_real_op()(pot, rhopw->npw * nks * nks);
-    resmem_real_op()(pot_stress, rhopw->npw * nks * nks);
+    resmem_real_op()(pot, rhopw->npw);
+    resmem_real_op()(pot_stress, rhopw->npw);
 
-    hamilt::get_exx_potential<Real, Device>(p_kv, wfcpw, rhopw, pot, tpiba, gamma_extrapolation, omega);
-    hamilt::get_exx_stress_potential<Real, Device>(p_kv, wfcpw, rhopw, pot_stress, tpiba, gamma_extrapolation, omega);
+    // hamilt::get_exx_potential<Real, Device>(p_kv, wfcpw, rhopw, pot, tpiba, gamma_extrapolation, omega);
+    // hamilt::get_exx_stress_potential<Real, Device>(p_kv, wfcpw, rhopw, pot_stress, tpiba, gamma_extrapolation, omega);
 
     // calculate the stress
 
@@ -75,6 +72,8 @@ void Stress_PW<FPTYPE, Device>::stress_exx(ModuleBase::matrix& sigma,
 
             for (int iq = 0; iq < nqs; iq++)
             {
+                hamilt::get_exx_potential<Real, Device>(p_kv, wfcpw, rhopw, pot, tpiba, gamma_extrapolation, omega, ik, iq, true);
+                hamilt::get_exx_stress_potential<Real, Device>(p_kv, wfcpw, rhopw, pot_stress, tpiba, gamma_extrapolation, omega, ik, iq);
                 for (int mband = 0; mband < d_psi_in->get_nbands(); mband++)
                 {
                     // psi_mq in real space
@@ -111,7 +110,7 @@ void Stress_PW<FPTYPE, Device>::stress_exx(ModuleBase::matrix& sigma,
                                 double kqg_beta = kqg[beta] * tpiba;
                                 // equation 10 of 10.1103/PhysRevB.73.125120
                                 double density_recip2 = std::real(density_recip[ig] * std::conj(density_recip[ig]));
-                                const int idx = ig + iq * rhopw->npw + ik * rhopw->npw * nqs;
+                                const int idx = ig;
                                 double pot_local = pot[idx];
                                 double pot_stress_local = pot_stress[idx];
                                 sigma_ab_loc += density_recip2 * pot_local * (kqg_alpha * kqg_beta * pot_stress_local - delta_ab) ;