Merge pull request #3225 from alibaba/feature/sync
MNN:Sync: Sync Internal 3.0.5
jxt1234 authored Feb 12, 2025
2 parents 32a8e0d + 3b6ddc0 commit aee17fa
Showing 118 changed files with 6,249 additions and 3,839 deletions.
19 changes: 16 additions & 3 deletions CMakeLists.txt
@@ -187,9 +187,6 @@ endif()
if(MNN_SUPPORT_TRANSFORMER_FUSE)
add_definitions(-DMNN_SUPPORT_TRANSFORMER_FUSE)
endif()
-if(MNN_BUILD_AUDIO)
-add_definitions(-DMNN_BUILD_AUDIO)
-endif()
# debug options
if(MNN_DEBUG_MEMORY)
add_definitions(-DMNN_DEBUG_MEMORY)
@@ -216,6 +213,8 @@ option(MNN_TENSORRT "Enable TensorRT" OFF)
option(MNN_COREML "Enable CoreML" OFF)
option(MNN_NNAPI "Enable NNAPI" OFF)

+option(MNN_GPU_TIME_PROFILE "Enable time profiling for the OpenCL backend and Vulkan backend." OFF)
+
option(MNN_CUDA_PROFILE "Enable CUDA profile" OFF)

if (NOT MNN_CUDA OR NOT CMAKE_SYSTEM_NAME MATCHES "^Linux")
@@ -470,6 +469,11 @@ IF(MNN_BUILD_LLM)
list(APPEND MNN_EXTRA_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/llm/engine/include/llm/llm.hpp)
ENDIF()

+IF(MNN_BUILD_DIFFUSION)
+file(GLOB MNN_DIFFUSION_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/diffusion/engine/include/diffusion/*)
+list(APPEND MNN_EXTRA_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/diffusion/engine/include/diffusion/diffusion.hpp)
+ENDIF()
+


# Add Thread dependency
@@ -921,6 +925,15 @@ ELSE()
ENDFOREACH()
ENDIF()

+IF(MNN_BUILD_DIFFUSION)
+if (NOT MNN_AAPL_FMWK)
+INSTALL(FILES ${MNN_DIFFUSION_HDRS} DESTINATION include/MNN/diffusion)
+endif()
+FOREACH(HDR ${MNN_DIFFUSION_HDRS})
+SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/diffusion )
+ENDFOREACH()
+ENDIF()
+
if (NOT MNN_AAPL_FMWK)
INSTALL(FILES ${MNN_PUB_HDRS} DESTINATION include/MNN/)
INSTALL(FILES ${MNN_EXPR_PUB_HDRS} DESTINATION include/MNN/expr/)
1 change: 1 addition & 0 deletions README.md
@@ -165,3 +165,4 @@ MNN refers to the following projects:
- [libyuv](https://chromium.googlesource.com/libyuv/libyuv)
- [libjpeg](https://github.com/libjpeg-turbo/libjpeg-turbo)
- [opencv](https://github.com/opencv/opencv)
+- [onnxruntime](https://github.com/microsoft/onnxruntime)
1 change: 1 addition & 0 deletions README_CN.md
@@ -155,4 +155,5 @@ MNN references and draws on the following projects:
- [libyuv](https://chromium.googlesource.com/libyuv/libyuv)
- [libjpeg](https://github.com/libjpeg-turbo/libjpeg-turbo)
- [opencv](https://github.com/opencv/opencv)
+- [onnxruntime](https://github.com/microsoft/onnxruntime)

1 change: 1 addition & 0 deletions README_JP.md
@@ -163,3 +163,4 @@ MNN refers to the following projects:
- [libyuv](https://chromium.googlesource.com/libyuv/libyuv)
- [libjpeg](https://github.com/libjpeg-turbo/libjpeg-turbo)
- [opencv](https://github.com/opencv/opencv)
+- [onnxruntime](https://github.com/microsoft/onnxruntime)
2 changes: 1 addition & 1 deletion docs/compile/cmake.md
@@ -59,7 +59,7 @@ MNN uses CMake to build the project; the available CMake macros are listed below:
| MNN_SSE_USE_FP16_INSTEAD | Whether to use `FP16` instead of `BF16` on x86; defaults to `OFF` |
| MNN_AVX512_VNNI | Whether to use the `avx512_vnni` instructions; only effective when `MNN_AVX512=ON`; defaults to `OFF` |
| MNN_OPENCL_SIZE_CUT | Whether to drop the OpenCL buffer implementation to shrink the OpenCL code size; only effective when `MNN_OPENCL=ON`; defaults to `OFF` |
-| MNN_OPENCL_PROFILE | Whether to enable OpenCL kernel performance profiling; only effective when `MNN_OPENCL=ON`; defaults to `OFF` |
+| MNN_GPU_TIME_PROFILE | Whether to enable kernel performance profiling for the OpenCL and Vulkan backends; only effective when `MNN_OPENCL=ON` or `MNN_VULKAN=ON`; defaults to `OFF` |
| MNN_METALLIB_SOURCE | Whether to build Metal kernels directly from Metal source; only effective when `MNN_METAL=ON`; defaults to `ON` |
| MNN_VULKAN_DEBUG | Whether to enable Vulkan DEBUG mode; only effective when `MNN_VULKAN=ON`; defaults to `OFF` |
| MNN_OPENGL_REGEN | Whether to regenerate OpenGL kernels; only effective when `MNN_OPENGL=ON`; defaults to `OFF` |
6 changes: 3 additions & 3 deletions docs/transformers/diffusion.md
@@ -49,13 +49,13 @@ python3 convert_mnn.py onnx_path mnn_save_path "--weightQuantBits=8 --transforme
cd mnn_path
mkdir build
cd build
-cmake .. -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
+cmake .. -DMNN_LOW_MEMORY=ON -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
make -j32
```
### On Android
```
cd mnn_path/project/android/build
-../build_64.sh -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
+../build_64.sh -DMNN_LOW_MEMORY=ON -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
../updateTest.sh
```
## Run the Diffusion Demo
@@ -90,6 +90,6 @@ cd mnn_path/project/android/build
```
## FAQ
1. The demo reports an error or segfaults at runtime; how do I fix it?
-- A common cause is insufficient device memory: devices with OpenCL fp16 support generally need more than 3GB of memory, and devices without fp16 support need more than 6GB of VRAM.
+- A common cause is insufficient device memory: devices with OpenCL fp16 support generally need more than 2GB of memory, and devices without fp16 support need more than 4GB of VRAM.
2. An error occurs when using other backends; what is the cause?
- Other backends do not yet support the fused transformer plugin operators; remove --transformerFuse at the onnx->mnn model conversion step.
15 changes: 0 additions & 15 deletions express/MathOp.cpp
@@ -1320,20 +1320,5 @@ VARP _Histogram(VARP x, int bin, int min, int max, int channel) {
return (Variable::create(Expr::create(std::move(op), {x})));
}

-#ifdef MNN_BUILD_AUDIO
-VARP _Stft(VARP sample, VARP window, int n_fft, int hop_length, bool abs) {
-std::unique_ptr<OpT> op(new OpT);
-op->type = OpType_Stft;
-op->main.type = OpParameter_StftParam;
-auto param = new StftParamT;
-param->n_fft = n_fft;
-param->hop_length = hop_length;
-param->abs = abs;
-op->main.value = param;
-EXPRP expr = Expr::create(std::move(op), {sample, window});
-return Variable::create(expr);
-}
-#endif
-
} // namespace Express
} // namespace MNN
8 changes: 6 additions & 2 deletions express/NeuralNetWorkOp.cpp
Expand Up @@ -1902,8 +1902,12 @@ VARP _Col2Im(VARP x, VARP outputShape, INTS kernelSize, INTS dilate, INTS pads,
auto common = new Convolution2DCommonT;
param->common.reset(common);
op->main.value = param;
-common->padX = pads[0];
-common->padY = pads[1];
+if (pads.size() == 4) {
+common->pads = pads;
+} else {
+common->padX = pads[0];
+common->padY = pads[1];
+}
common->strideX = stride[0];
common->strideY = stride[1];
common->dilateX = dilate[0];
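This change lets `_Col2Im` accept either the legacy two-value padding (`padX`, `padY`) or a four-value padding that is copied verbatim into `Convolution2DCommonT::pads`. Below is a minimal sketch of both call forms, assuming the Express creator API and the `(x, outputShape, kernelSize, dilate, pads, stride)` argument order shown above; the shapes, pad values, and the element ordering inside the four-value `pads` are illustrative, not taken from this commit.

```cpp
// Hedged sketch: exercise both pads forms of _Col2Im.
#include <MNN/expr/ExprCreator.hpp>

using namespace MNN::Express;

int main() {
    // Column buffer shaped (N, C*kh*kw, L); values are arbitrary.
    auto x = _Input({1, 9, 16}, NCHW);
    int shapeData[2] = {6, 6}; // hypothetical output spatial size
    auto outputShape = _Const(shapeData, {2}, NCHW, halide_type_of<int>());
    // Legacy form: two values populate common->padX / common->padY.
    auto y2 = _Col2Im(x, outputShape, {3, 3}, {1, 1}, {1, 1}, {1, 1});
    // New form: four values are copied straight into common->pads.
    auto y4 = _Col2Im(x, outputShape, {3, 3}, {1, 1}, {1, 1, 1, 1}, {1, 1});
    return 0;
}
```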
9 changes: 6 additions & 3 deletions express/module/PipelineModule.cpp
@@ -713,6 +713,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, std::shared_ptr<BufferStorage> bufferStorage, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config* config, std::map<std::string, SubGraph>& subGraphMap) {

Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, std::shared_ptr<BufferStorage> bufferStorage, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config* config, std::map<std::string, SubGraph>& subGraphMap) {
MNN_ASSERT(nullptr != rtMgr);
+MNN_ASSERT(nullptr != config);
std::shared_ptr<Schedule::ScheduleInfo> sharedConst;
auto buffer = bufferStorage->buffer();
auto length = bufferStorage->size();
@@ -721,12 +722,14 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
// Extra Const Tensors
sharedConst.reset(new Schedule::ScheduleInfo);
auto curExe = ExecutorScope::Current();
-bool permitCodeGen = false;
+bool preReplaceConstTensor = true;
std::shared_ptr<ModuleRuntimeConfig> modRuntimeCfgPtr(new ModuleRuntimeConfig);
if (!rtMgr->getInside()->mContent->mExternalFile.empty()) {
modRuntimeCfgPtr->externalFile = rtMgr->getInside()->mContent->mExternalFile;
}
-permitCodeGen = rtMgr->getInside()->mContent->modes.codegenMode == Interpreter::Session_Codegen_Enable;
+if (rtMgr->getInside()->mContent->modes.codegenMode == Interpreter::Session_Codegen_Enable || (!config->shapeMutable)) {
+preReplaceConstTensor = false;
+}
std::shared_ptr<Backend> defaultBackend = curExe->getAttr()->constantBackend;
std::vector<std::shared_ptr<Tensor>> allTensors;
sharedConst->allTensors.resize(net->tensorName()->size());
@@ -795,7 +798,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
for (int i=0; i<subModulesInfo.size(); ++i) {
subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, sharedConst, *config, modRuntime));
}
-if (!permitCodeGen) {
+if (preReplaceConstTensor) {
// Prereplace const tensor
auto curBackend = sharedConst->constReplaceBackend.get();
if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) {
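With the added `MNN_ASSERT(nullptr != config)`, this load path now requires an explicit `Module::Config`, and `config->shapeMutable` joins the codegen mode in deciding whether constant tensors are pre-replaced. A hedged caller sketch against the public `Module::load` overload follows; everything beyond "pass a non-null config" is illustrative.

```cpp
// Hedged sketch: always hand Module::load an explicit config, since
// PipelineModule::load now asserts that config is non-null.
#include <memory>
#include <string>
#include <vector>
#include <MNN/expr/Module.hpp>

using MNN::Express::Module;

std::shared_ptr<Module> loadNet(const std::string& path) {
    Module::Config cfg;
    cfg.shapeMutable = true; // false would also disable const pre-replacement here
    std::vector<std::string> inputs{"input"};   // hypothetical tensor names
    std::vector<std::string> outputs{"output"};
    return std::shared_ptr<Module>(
        Module::load(inputs, outputs, path.c_str(), &cfg));
}
```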
2 changes: 1 addition & 1 deletion include/MNN/MNNDefine.h
@@ -76,6 +76,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 3
#define MNN_VERSION_MINOR 0
-#define MNN_VERSION_PATCH 4
+#define MNN_VERSION_PATCH 5
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
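Since the only change here is the patch-level bump from 3.0.4 to 3.0.5, downstream code can gate on the version macros. A small hypothetical guard (the macro names are real, the policy is illustrative):

```cpp
// Hypothetical compile-time guard for features that arrived with 3.0.5.
#include <MNN/MNNDefine.h>

#if (MNN_VERSION_MAJOR < 3) || \
    (MNN_VERSION_MAJOR == 3 && MNN_VERSION_MINOR == 0 && MNN_VERSION_PATCH < 5)
#error "This translation unit expects MNN >= 3.0.5"
#endif
```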
3 changes: 0 additions & 3 deletions include/MNN/expr/MathOp.hpp
@@ -138,9 +138,6 @@ MNN_PUBLIC VARP _CumSum(VARP x, int axis, bool exclusive = false, bool reverse =
MNN_PUBLIC VARP _CumProd(VARP x, int axis);
MNN_PUBLIC VARPS _Svd(VARP x);
MNN_PUBLIC VARP _Histogram(VARP x, int bin, int min, int max, int channel = -1);
-#ifdef MNN_BUILD_AUDIO
-MNN_PUBLIC VARP _Stft(VARP sample, VARP window, int n_fft, int hop_length, bool abse = true);
-#endif
}; // namespace Express
}; // namespace MNN

73 changes: 65 additions & 8 deletions source/backend/arm82/Arm82Functions.cpp
@@ -25,6 +25,7 @@ using Vec = MNN::Math::Vec<FLOAT16, 8>;
extern "C" {
// (UP_DIV(l,8), e, 8) -> (UP_DIV(e,eP), l, eP)
void Arm82MNNPackForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el);
+// void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size_t depth, int32_t* areaOffset);

// C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, eP), hP = 24
// parameter: [aStride, l, h, cStride, bExtraStride]
@@ -372,14 +373,44 @@ void MNNUnpackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, si
int c = (int)depth;
int cDiv4 = c / 8;
int cAlign = cDiv4 * 8;
+int areaDiv4 = area / 4;
+int areaAlign = areaDiv4 * 4;

-for (int hi = 0; hi < area; ++hi) {
-auto srcHeight = src + hi * 8;
-auto dstHeight = dst + hi * c;
+if (areaAlign > 0) {
for (int ci = 0; ci < cDiv4; ++ci) {
-vst1q_s16(dstHeight + ci * 8, vld1q_s16(srcHeight + 8 * ci * srcAreaOffset));
+auto srcH = src + ci * 8 * srcAreaOffset;
+auto dstH = dst + ci * 8;
+for (int hi = 0; hi < areaAlign; hi+=4) {
+auto src0 = srcH + hi * 8;
+auto src1 = srcH + hi * 8 + 8;
+auto src2 = srcH + hi * 8 + 16;
+auto src3 = srcH + hi * 8 + 24;
+
+auto dst0 = dstH + hi * c;
+auto dst1 = dstH + hi * c + c;
+auto dst2 = dstH + hi * c + 2 * c;
+auto dst3 = dstH + hi * c + 3 * c;
+vst1q_s16(dst0, vld1q_s16(src0));
+vst1q_s16(dst1, vld1q_s16(src1));
+vst1q_s16(dst2, vld1q_s16(src2));
+vst1q_s16(dst3, vld1q_s16(src3));
+}
}
}
+if (areaAlign < area) {
+for (int ci = 0; ci < cDiv4; ++ci) {
+auto srcH = src + 8 * ci * srcAreaOffset;
+auto dstH = dst + ci * 8;
+for (int hi = areaAlign; hi < area; ++hi) {
+auto src0 = srcH + hi * 8;
+auto dst0 = dstH + hi * c;
+vst1q_s16(dst0, vld1q_s16(src0));
+}
+}
+}
if (c == cAlign) {
return;
}

int cReamin = c - cAlign;
auto srcAlign = src + srcAreaOffset * cAlign;
@@ -404,11 +435,37 @@ void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size
int c = (int)depth;
int cDiv4 = c / 8;
int cAlign = cDiv4 * 8;
-for (int hi = 0; hi < area; ++hi) {
-auto srcHeight = (src + hi * c);
-auto dstHeight = (dst + hi * 8);
+int areaDiv4 = area / 4;
+int areaAlign = areaDiv4 * 4;
+if (areaAlign > 0) {
for (int ci = 0; ci < cDiv4; ++ci) {
-vst1q_s16(dstHeight + ci * dstAreaOffset * 8, vld1q_s16(srcHeight + 8 * ci));
+auto srcH = src + ci * 8;
+auto dstH = dst + ci * dstAreaOffset * 8;
+for (int hi = 0; hi < areaAlign; hi+=4) {
+auto src0 = srcH + hi * c;
+auto src1 = srcH + hi * c + c;
+auto src2 = srcH + hi * c + 2 * c;
+auto src3 = srcH + hi * c + 3 * c;
+auto dst0 = dstH + hi * 8;
+auto dst1 = dstH + hi * 8 + 8;
+auto dst2 = dstH + hi * 8 + 16;
+auto dst3 = dstH + hi * 8 + 24;
+vst1q_s16(dst0, vld1q_s16(src0));
+vst1q_s16(dst1, vld1q_s16(src1));
+vst1q_s16(dst2, vld1q_s16(src2));
+vst1q_s16(dst3, vld1q_s16(src3));
+}
}
}
+if (areaAlign < area) {
+for (int ci = 0; ci < cDiv4; ++ci) {
+auto srcH = src + ci * 8;
+auto dstH = dst + ci * dstAreaOffset * 8;
+for (int hi = areaAlign; hi < area; ++hi) {
+auto src0 = srcH + hi * c;
+auto dst0 = dstH + hi * 8;
+vst1q_s16(dst0, vld1q_s16(src0));
+}
+}
+}

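Both rewrites preserve the original index math and only add a 4-row inner blocking so that four `vst1q_s16` stores issue per loop iteration. As a reading aid, here is a scalar sketch (a simplification under stated assumptions, not code from this commit) of what `MNNUnpackTransposeInt16C8` computes for full 8-channel groups; the real function also handles the `depth % 8` tail and takes its offsets via an `areaOffset` array rather than the flattened `srcAreaOffset` parameter used here.

```cpp
#include <stdint.h>
#include <stddef.h>

// Scalar reference: unpack (UP_DIV(depth,8), area, 8) C8 storage into a
// plain (area, depth) layout. Only full channel groups are handled; the
// depth % 8 remainder is omitted for brevity.
void UnpackTransposeInt16C8Ref(int16_t* dst, const int16_t* src,
                               size_t area, size_t depth, int srcAreaOffset) {
    int c = (int)depth;
    int cDiv8 = c / 8;
    for (int hi = 0; hi < (int)area; ++hi) {
        for (int ci = 0; ci < cDiv8; ++ci) {
            for (int k = 0; k < 8; ++k) {
                // Mirrors vst1q_s16(dst + hi*c + ci*8,
                //                   vld1q_s16(src + 8*ci*srcAreaOffset + hi*8))
                dst[hi * c + ci * 8 + k] = src[ci * 8 * srcAreaOffset + hi * 8 + k];
            }
        }
    }
}
```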