Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[CodeStyle][Typos][S-[26-33]] Fix typos(Skiped,skiped,smll,samll,somme,Patial,patial,specificed,splite,spliter) #70988

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,6 @@ registe = 'registe'
REGIST = 'REGIST'
Regiter = 'Regiter'
setted = 'setted'
Skiped = 'Skiped'
skiped = 'skiped'
smll = 'smll'
samll = 'samll'
somme = 'somme'
patial = 'patial'
Patial = 'Patial'
specificed = 'specificed'
splite = 'splite'
spliter = 'spliter'
spliting = 'spliting'
Spliting = 'Spliting'
splited = 'splited'
Expand Down
10 changes: 5 additions & 5 deletions paddle/fluid/framework/new_executor/interpreter/static_build.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ std::set<std::string> OpsHandledInStaticBuild = {"conditional_block",
"read",
"while"};

std::set<std::string> OpsCanSkipedFakeAllocInStaticBuild = {
std::set<std::string> OpsCanSkippedFakeAllocInStaticBuild = {
"c_comm_init",
"comm_init_all",
"c_comm_init_multitrainer",
Expand Down Expand Up @@ -130,7 +130,7 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
std::set<std::pair<std::string, KernelCode>> invalid_ops;
for (auto& op : block.AllOps()) {
auto op_type = op->Type();
if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type) ||
if (OpsCanSkippedFakeAllocInStaticBuild.count(op_type) ||
OpsHandledInStaticBuild.count(op_type)) {
continue;
}
Expand Down Expand Up @@ -194,7 +194,7 @@ bool TensorShouldBeFakeInitialized(const OperatorBase& op,
const std::string& parameter_name,
const phi::TensorBase* tensor) {
const std::string& op_type = op.Type();
if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) {
if (OpsCanSkippedFakeAllocInStaticBuild.count(op_type)) {
return false;
}

Expand Down Expand Up @@ -644,7 +644,7 @@ void FakeInitializeOutputsForOperatorBase(
Scope* scope,
std::vector<std::shared_ptr<OperatorBase>> following_ops) {
const std::string& op_type = op.Type();
if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) {
if (OpsCanSkippedFakeAllocInStaticBuild.count(op_type)) {
return;
}

Expand Down Expand Up @@ -937,7 +937,7 @@ void FakeInitializeOutputsForStructureKernel(
const framework::OpKernelType& op_kernel_type,
ExecutionContext* execution_context) {
const framework::OperatorBase& op = execution_context->GetOp();
if (OpsCanSkipedFakeAllocInStaticBuild.count(op.Type())) {
if (OpsCanSkippedFakeAllocInStaticBuild.count(op.Type())) {
return;
}

Expand Down
18 changes: 9 additions & 9 deletions paddle/phi/common/cpstring_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ typedef struct PD_PString_Small { // NOLINT

typedef struct PD_PString { // NOLINT
union {
PD_PString_Small smll;
PD_PString_Small small;
PD_PString_Large large;
PD_PString_Offset offset;
PD_PString_View view;
Expand Down Expand Up @@ -275,7 +275,7 @@ HOSTDEVICE static inline void PD_PString_Dealloc(PD_PString *str) {
HOSTDEVICE static inline size_t PD_PString_GetSize(const PD_PString *str) {
switch (PD_PString_GetType(str)) {
case PD_PSTR_SMALL:
return str->u.smll.size >> 2;
return str->u.small.size >> 2;
case PD_PSTR_LARGE:
return PD_PString_ToActualSizeT(str->u.large.size);
case PD_PSTR_OFFSET:
Expand Down Expand Up @@ -304,7 +304,7 @@ HOSTDEVICE static inline const char *PD_PString_GetDataPointer(
const PD_PString *str) {
switch (PD_PString_GetType(str)) {
case PD_PSTR_SMALL:
return str->u.smll.str;
return str->u.small.str;
case PD_PSTR_LARGE:
return str->u.large.ptr;
case PD_PSTR_OFFSET:
Expand All @@ -327,18 +327,18 @@ HOSTDEVICE static inline char *PD_PString_ResizeUninitialized(PD_PString *str,

// Case: SMALL/LARGE/VIEW/OFFSET -> SMALL
if (new_size <= PD_PString_SmallCapacity) {
str->u.smll.size = (uint8_t)((new_size << 2) | PD_PSTR_SMALL); // NOLINT
str->u.smll.str[new_size] = '\0';
str->u.small.size = (uint8_t)((new_size << 2) | PD_PSTR_SMALL); // NOLINT
str->u.small.str[new_size] = '\0';

if (curr_type != PD_PSTR_SMALL && copy_size) {
PD_Memcpy(str->u.smll.str, curr_ptr, copy_size);
PD_Memcpy(str->u.small.str, curr_ptr, copy_size);
}

if (curr_type == PD_PSTR_LARGE) {
PD_Free((void *)curr_ptr, str->u.large.cap + 1); // NOLINT
}

return str->u.smll.str;
return str->u.small.str;
}

// Case: SMALL/LARGE/VIEW/OFFSET -> LARGE
Expand Down Expand Up @@ -380,12 +380,12 @@ HOSTDEVICE static inline char *PD_PString_GetMutableDataPointer(
PD_PString *str) {
switch (PD_PString_GetType(str)) {
case PD_PSTR_SMALL:
return str->u.smll.str;
return str->u.small.str;
case PD_PSTR_OFFSET:
case PD_PSTR_VIEW:
// Convert OFFSET/VIEW to SMALL/LARGE
PD_PString_ResizeUninitialized(str, PD_PString_GetSize(str));
return (PD_PString_GetType(str) == PD_PSTR_SMALL) ? str->u.smll.str
return (PD_PString_GetType(str) == PD_PSTR_SMALL) ? str->u.small.str
: str->u.large.ptr;
case PD_PSTR_LARGE:
return str->u.large.ptr;
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/reduce_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ struct ReduceConfig {
}

// Set block and grid for launch kernel
// for ReduceHigherDim: if block is enough -> splite reduce_num
// for ReduceHigherDim: if block is enough -> split reduce_num
// else init block(32, 1) grid(block_num, 1)
// for others: block(block_num, 1) , grid(left_num, 1)
void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,13 +262,13 @@ struct AttentionScalingCoefsUpdaterVolta
using Element = accum_t;

static int const kElementsPerPartial = 4;
using EleShapePerPatial = typename cutlass::platform::conditional<
using EleShapePerPartial = typename cutlass::platform::conditional<
cutlass::platform::is_same<Element, float>::value,
cutlass::MatrixShape<2, 2>,
cutlass::MatrixShape<1, 4>>::type;
static int const kElementsPerMma = 8;
static int const kAccumulatorPatials = 2;
using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
static int const kAccumulatorPartials = 2;
using QuadShapePerPartialMma = cutlass::MatrixShape<4, 4>;

static cutlass::MatrixCoord CUTLASS_DEVICE
get_lane_offset(int8_t lane_id,
Expand All @@ -283,12 +283,13 @@ struct AttentionScalingCoefsUpdaterVolta
accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
// (quad[1])+lane_in_quad[1]
accum_n =
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPartials +
(lane_in_quad & 2);
} else {
accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
lane_in_quad; // (quad[2],quad[0])
accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
accum_n =
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPartials;
}
return cutlass::MatrixCoord(
accum_m + tile_offset.row() * Shape::kRow,
Expand Down Expand Up @@ -322,9 +323,9 @@ struct AttentionScalingCoefsUpdaterVolta
CUTLASS_PRAGMA_UNROLL
for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
CUTLASS_PRAGMA_UNROLL
for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
for (int m = 0; m < EleShapePerPartial::kRow; ++m) {
int accum_m = tile_m * Policy::InterleavedTile::kRow +
mma_m * QuadShapePerPatialMma::kRow + m * 2 +
mma_m * QuadShapePerPartialMma::kRow + m * 2 +
lane_offset.row();
beginRow(accum_m);

Expand All @@ -335,9 +336,9 @@ struct AttentionScalingCoefsUpdaterVolta
for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn;
++mma_n) {
CUTLASS_PRAGMA_UNROLL
for (int p = 0; p < kAccumulatorPatials; ++p) {
for (int p = 0; p < kAccumulatorPartials; ++p) {
CUTLASS_PRAGMA_UNROLL
for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
for (int n = 0; n < EleShapePerPartial::kColumn; ++n) {
int mma_accum_start =
(((tile_n * Policy::TileIterations::kRow + tile_m) *
Policy::MmaIterations::kColumn +
Expand All @@ -346,11 +347,11 @@ struct AttentionScalingCoefsUpdaterVolta
mma_m) *
kElementsPerMma;
int accum_n = tile_n * Policy::InterleavedTile::kColumn +
mma_n * QuadShapePerPatialMma::kColumn +
mma_n * QuadShapePerPartialMma::kColumn +
p * Policy::InterleavedTile::kColumn / 2 + n +
lane_offset.column();
int idx = mma_accum_start + p * kElementsPerPartial +
m * EleShapePerPatial::kColumn + n;
m * EleShapePerPartial::kColumn + n;
op(accum_m, accum_n, idx);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,13 @@ struct AccumLambdaIteratorSm70 {
using Element = accum_t;

static int const kElementsPerPartial = 4;
using EleShapePerPatial = typename cutlass::platform::conditional<
using EleShapePerPartial = typename cutlass::platform::conditional<
cutlass::platform::is_same<Element, float>::value,
cutlass::MatrixShape<2, 2>,
cutlass::MatrixShape<1, 4>>::type;
static int const kElementsPerMma = 8;
static int const kAccumulatorPatials = 2;
using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
static int const kAccumulatorPartials = 2;
using QuadShapePerPartialMma = cutlass::MatrixShape<4, 4>;

static cutlass::MatrixCoord CUTLASS_DEVICE
get_lane_offset(int8_t lane_id,
Expand All @@ -138,12 +138,13 @@ struct AccumLambdaIteratorSm70 {
accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
// (quad[1])+lane_in_quad[1]
accum_n =
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPartials +
(lane_in_quad & 2);
} else {
accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
lane_in_quad; // (quad[2],quad[0])
accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
accum_n =
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPartials;
}
return cutlass::MatrixCoord(
accum_m + tile_offset.row() * Shape::kRow,
Expand Down Expand Up @@ -177,9 +178,9 @@ struct AccumLambdaIteratorSm70 {
CUTLASS_PRAGMA_UNROLL
for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
CUTLASS_PRAGMA_UNROLL
for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
for (int m = 0; m < EleShapePerPartial::kRow; ++m) {
int accum_m = tile_m * Policy::InterleavedTile::kRow +
mma_m * QuadShapePerPatialMma::kRow + m * 2 +
mma_m * QuadShapePerPartialMma::kRow + m * 2 +
lane_offset.row();
beginRow(accum_m);

Expand All @@ -190,9 +191,9 @@ struct AccumLambdaIteratorSm70 {
for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn;
++mma_n) {
CUTLASS_PRAGMA_UNROLL
for (int p = 0; p < kAccumulatorPatials; ++p) {
for (int p = 0; p < kAccumulatorPartials; ++p) {
CUTLASS_PRAGMA_UNROLL
for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
for (int n = 0; n < EleShapePerPartial::kColumn; ++n) {
int mma_accum_start =
(((tile_n * Policy::TileIterations::kRow + tile_m) *
Policy::MmaIterations::kColumn +
Expand All @@ -201,11 +202,11 @@ struct AccumLambdaIteratorSm70 {
mma_m) *
kElementsPerMma;
int accum_n = tile_n * Policy::InterleavedTile::kColumn +
mma_n * QuadShapePerPatialMma::kColumn +
mma_n * QuadShapePerPartialMma::kColumn +
p * Policy::InterleavedTile::kColumn / 2 + n +
lane_offset.column();
int idx = mma_accum_start + p * kElementsPerPartial +
m * EleShapePerPatial::kColumn + n;
m * EleShapePerPartial::kColumn + n;
op(accum_m, accum_n, idx);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1749,13 +1749,13 @@ struct B2bGemm<cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
// Those are MmaVoltaTensorOpAccumulatorTileIterator private fields
// Let's copy their values
static int const kElementsPerPartial = 4;
using EleShapePerPatial = typename cutlass::platform::conditional<
using EleShapePerPartial = typename cutlass::platform::conditional<
cutlass::platform::is_same<Element, float>::value,
cutlass::MatrixShape<2, 2>,
cutlass::MatrixShape<1, 4>>::type;
static int const kElementsPerMma = 8;
static int const kAccumulatorPatials = 2;
using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>;
static int const kAccumulatorPartials = 2;
using QuadShapePerPartialMma = cutlass::MatrixShape<4, 4>;

static void CUTLASS_DEVICE
accumToSmem(AccumulatorSharedStorage& shared_storage, // NOLINT
Expand All @@ -1773,12 +1773,13 @@ struct B2bGemm<cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1);
// (quad[1])+lane_in_quad[1]
accum_n =
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials +
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPartials +
(lane_in_quad & 2);
} else {
accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 +
lane_in_quad; // (quad[2],quad[0])
accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials;
accum_n =
((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPartials;
}
cutlass::MatrixCoord lane_offset(accum_m, accum_n);

Expand All @@ -1787,7 +1788,7 @@ struct B2bGemm<cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
cutlass::MatrixCoord({IteratorC::Shape::kRow,
IteratorC::Shape::kColumn}));

using AccessType = cutlass::Array<scalar_t, EleShapePerPatial::kColumn>;
using AccessType = cutlass::Array<scalar_t, EleShapePerPartial::kColumn>;

// store - from MmaVoltaTensorOpAccumulatorTileIterator
CUTLASS_PRAGMA_UNROLL
Expand All @@ -1807,20 +1808,20 @@ struct B2bGemm<cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator<
kElementsPerMma;

CUTLASS_PRAGMA_UNROLL
for (int p = 0; p < kAccumulatorPatials; ++p) {
for (int p = 0; p < kAccumulatorPartials; ++p) {
CUTLASS_PRAGMA_UNROLL
for (int m = 0; m < EleShapePerPatial::kRow; ++m) {
for (int m = 0; m < EleShapePerPartial::kRow; ++m) {
int accum_m = tile_m * Policy::InterleavedTile::kRow +
mma_m * QuadShapePerPatialMma::kRow + m * 2;
mma_m * QuadShapePerPartialMma::kRow + m * 2;
int accum_n = tile_n * Policy::InterleavedTile::kColumn +
mma_n * QuadShapePerPatialMma::kColumn +
mma_n * QuadShapePerPartialMma::kColumn +
p * Policy::InterleavedTile::kColumn / 2;
int r = (accum_m + lane_offset.row());
AccessType to_store;
CUTLASS_PRAGMA_UNROLL
for (int n = 0; n < EleShapePerPatial::kColumn; ++n) {
for (int n = 0; n < EleShapePerPartial::kColumn; ++n) {
int idx = mma_accum_start + p * kElementsPerPartial +
m * EleShapePerPatial::kColumn + n;
m * EleShapePerPartial::kColumn + n;
int c = (accum_n + n + lane_offset.column());
to_store[n] = scalar_t(accum[idx]);
}
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/impl/lstm_kernel_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ void LSTMKernel(const Context& dev_ctx,
static_cast<T>(1.0));
} else if (hidden_t0 != nullptr) {
// If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skiped.
// the H0 is zeros, the calculation W_h * H0 will be skipped.
// If n == 0 and there is initialized hidden state, calculate W_h * H0.

// Since the batch computing for LSTM reorders the input sequence
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,9 @@ def reshard_s_to_r_with_padding(
allgather_op.result(0), num_of_process, 0
)
builtin_split_op = split_values[0].get_defining_op()
pd_splite_op = builtin_split_op.operand_source(0).get_defining_op()
pd_splite_op.dist_attr = copy_op_attr_with_new_member(
pd_splite_op.dist_attr, new_chunk_id=chunk_id
pd_split_op = builtin_split_op.operand_source(0).get_defining_op()
pd_split_op.dist_attr = copy_op_attr_with_new_member(
pd_split_op.dist_attr, new_chunk_id=chunk_id
)

# fix the split_with_num dist attribute.
Expand All @@ -277,7 +277,7 @@ def reshard_s_to_r_with_padding(
vec_type = paddle.base.libpaddle.pir.create_vec_type(
new_inner_types
)
pd_splite_op.result(0).set_type(vec_type)
pd_split_op.result(0).set_type(vec_type)

if padding_num != 0:
tmp_split_values = paddle._C_ops.split(
Expand Down Expand Up @@ -309,7 +309,7 @@ def reshard_s_to_r_with_padding(
builtin_combine_op = concat_op.operand_source(
0
).get_defining_op()
concat_op.operand(0).set_source(pd_splite_op.result(0))
concat_op.operand(0).set_source(pd_split_op.result(0))
builtin_combine_op.erase()
builtin_split_op.erase()
return concat_value
Expand Down
Loading
Loading