Merge pull request #3225 from alibaba/feature/sync
MNN:Sync: Sync Internal 3.0.5
jxt1234 authored Feb 12, 2025
2 parents 32a8e0d + 3b6ddc0 commit aee17fa
Showing 118 changed files with 6,249 additions and 3,839 deletions.
19 changes: 16 additions & 3 deletions CMakeLists.txt
@@ -187,9 +187,6 @@ endif()
if(MNN_SUPPORT_TRANSFORMER_FUSE)
add_definitions(-DMNN_SUPPORT_TRANSFORMER_FUSE)
endif()
-if(MNN_BUILD_AUDIO)
-add_definitions(-DMNN_BUILD_AUDIO)
-endif()
# debug options
if(MNN_DEBUG_MEMORY)
add_definitions(-DMNN_DEBUG_MEMORY)
@@ -216,6 +213,8 @@ option(MNN_TENSORRT "Enable TensorRT" OFF)
option(MNN_COREML "Enable CoreML" OFF)
option(MNN_NNAPI "Enable NNAPI" OFF)

+option(MNN_GPU_TIME_PROFILE "Enable time profiling for the OpenCL backend and Vulkan backend." OFF)
+
option(MNN_CUDA_PROFILE "Enable CUDA profile" OFF)

if (NOT MNN_CUDA OR NOT CMAKE_SYSTEM_NAME MATCHES "^Linux")
@@ -470,6 +469,11 @@ IF(MNN_BUILD_LLM)
list(APPEND MNN_EXTRA_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/llm/engine/include/llm/llm.hpp)
ENDIF()

+IF(MNN_BUILD_DIFFUSION)
+file(GLOB MNN_DIFFUSION_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/diffusion/engine/include/diffusion/*)
+list(APPEND MNN_EXTRA_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/transformers/diffusion/engine/include/diffusion/diffusion.hpp)
+ENDIF()
+


# Add Thread dependency
@@ -921,6 +925,15 @@ ELSE()
ENDFOREACH()
ENDIF()

+IF(MNN_BUILD_DIFFUSION)
+if (NOT MNN_AAPL_FMWK)
+INSTALL(FILES ${MNN_DIFFUSION_HDRS} DESTINATION include/MNN/diffusion)
+endif()
+FOREACH(HDR ${MNN_DIFFUSION_HDRS})
+SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/diffusion )
+ENDFOREACH()
+ENDIF()
+
if (NOT MNN_AAPL_FMWK)
INSTALL(FILES ${MNN_PUB_HDRS} DESTINATION include/MNN/)
INSTALL(FILES ${MNN_EXPR_PUB_HDRS} DESTINATION include/MNN/expr/)
1 change: 1 addition & 0 deletions README.md
@@ -165,3 +165,4 @@ MNN refers to the following projects:
- [libyuv](https://chromium.googlesource.com/libyuv/libyuv)
- [libjpeg](https://github.com/libjpeg-turbo/libjpeg-turbo)
- [opencv](https://github.com/opencv/opencv)
+- [onnxruntime](https://github.com/microsoft/onnxruntime)
1 change: 1 addition & 0 deletions README_CN.md
@@ -155,4 +155,5 @@ MNN references and draws on the following projects:
- [libyuv](https://chromium.googlesource.com/libyuv/libyuv)
- [libjpeg](https://github.com/libjpeg-turbo/libjpeg-turbo)
- [opencv](https://github.com/opencv/opencv)
+- [onnxruntime](https://github.com/microsoft/onnxruntime)

1 change: 1 addition & 0 deletions README_JP.md
@@ -163,3 +163,4 @@ MNN refers to the following projects:
- [libyuv](https://chromium.googlesource.com/libyuv/libyuv)
- [libjpeg](https://github.com/libjpeg-turbo/libjpeg-turbo)
- [opencv](https://github.com/opencv/opencv)
+- [onnxruntime](https://github.com/microsoft/onnxruntime)
2 changes: 1 addition & 1 deletion docs/compile/cmake.md
@@ -59,7 +59,7 @@ MNN uses CMake to build the project; the available CMake macros are listed below:
| MNN_SSE_USE_FP16_INSTEAD | Whether to use `FP16` instead of `BF16` on x86; defaults to `OFF` |
| MNN_AVX512_VNNI | Whether to use the `avx512_vnni` instructions; only effective when `MNN_AVX512=ON`; defaults to `OFF` |
| MNN_OPENCL_SIZE_CUT | Whether to drop the OpenCL buffer implementation to shrink the OpenCL code size; only effective when `MNN_OPENCL=ON`; defaults to `OFF` |
-| MNN_OPENCL_PROFILE | Whether to enable OpenCL kernel performance profiling; only effective when `MNN_OPENCL=ON`; defaults to `OFF` |
+| MNN_GPU_TIME_PROFILE | Whether to enable kernel performance profiling for the OpenCL and Vulkan backends; only effective when `MNN_OPENCL=ON` or `MNN_VULKAN=ON`; defaults to `OFF` |
| MNN_METALLIB_SOURCE | Whether to build Metal kernels directly from Metal source; only effective when `MNN_METAL=ON`; defaults to `ON` |
| MNN_VULKAN_DEBUG | Whether to enable Vulkan DEBUG mode; only effective when `MNN_VULKAN=ON`; defaults to `OFF` |
| MNN_OPENGL_REGEN | Whether to regenerate OpenGL kernels; only effective when `MNN_OPENGL=ON`; defaults to `OFF` |
6 changes: 3 additions & 3 deletions docs/transformers/diffusion.md
@@ -49,13 +49,13 @@ python3 convert_mnn.py onnx_path mnn_save_path "--weightQuantBits=8 --transforme
cd mnn_path
mkdir build
cd build
-cmake .. -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
+cmake .. -DMNN_LOW_MEMORY=ON -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
make -j32
```
### On Android
```
cd mnn_path/project/android/build
-../build_64.sh -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
+../build_64.sh -DMNN_LOW_MEMORY=ON -DMNN_BUILD_DIFFUSION=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_OPENCL=ON -DMNN_SEP_BUILD=OFF -DMNN_SUPPORT_TRANSFORMER_FUSE=ON
../updateTest.sh
```
## Run the Diffusion Demo
@@ -90,6 +90,6 @@ cd mnn_path/project/android/build
```
## FAQ
1. The demo reports an error or segfaults at runtime; how do I fix it?
-- A common cause is insufficient device memory: devices with OpenCL fp16 support generally need more than 3GB of memory, and devices without fp16 support need more than 6GB of VRAM.
+- A common cause is insufficient device memory: devices with OpenCL fp16 support generally need more than 2GB of memory, and devices without fp16 support need more than 4GB of VRAM.
2. An error occurs when using other backends; what is the cause?
- Other backends do not yet support the fused transformer plugin operators; remove --transformerFuse at the onnx->mnn model conversion step.
15 changes: 0 additions & 15 deletions express/MathOp.cpp
@@ -1320,20 +1320,5 @@ VARP _Histogram(VARP x, int bin, int min, int max, int channel) {
return (Variable::create(Expr::create(std::move(op), {x})));
}

-#ifdef MNN_BUILD_AUDIO
-VARP _Stft(VARP sample, VARP window, int n_fft, int hop_length, bool abs) {
-std::unique_ptr<OpT> op(new OpT);
-op->type = OpType_Stft;
-op->main.type = OpParameter_StftParam;
-auto param = new StftParamT;
-param->n_fft = n_fft;
-param->hop_length = hop_length;
-param->abs = abs;
-op->main.value = param;
-EXPRP expr = Expr::create(std::move(op), {sample, window});
-return Variable::create(expr);
-}
-#endif
-
} // namespace Express
} // namespace MNN
8 changes: 6 additions & 2 deletions express/NeuralNetWorkOp.cpp
Expand Up @@ -1902,8 +1902,12 @@ VARP _Col2Im(VARP x, VARP outputShape, INTS kernelSize, INTS dilate, INTS pads,
auto common = new Convolution2DCommonT;
param->common.reset(common);
op->main.value = param;
-common->padX = pads[0];
-common->padY = pads[1];
+if (pads.size() == 4) {
+common->pads = pads;
+} else {
+common->padX = pads[0];
+common->padY = pads[1];
+}
common->strideX = stride[0];
common->strideY = stride[1];
common->dilateX = dilate[0];
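This change lets `_Col2Im` accept either the legacy two-value padding (`padX`, `padY`) or a four-value padding that is copied verbatim into `Convolution2DCommonT::pads`. Below is a minimal sketch of both call forms, assuming the Express creator API and the `(x, outputShape, kernelSize, dilate, pads, stride)` argument order shown above; the shapes, pad values, and the element ordering inside the four-value `pads` are illustrative, not taken from this commit.

```cpp
// Hedged sketch: exercise both pads forms of _Col2Im.
#include <MNN/expr/ExprCreator.hpp>

using namespace MNN::Express;

int main() {
    // Column buffer shaped (N, C*kh*kw, L); values are arbitrary.
    auto x = _Input({1, 9, 16}, NCHW);
    int shapeData[2] = {6, 6}; // hypothetical output spatial size
    auto outputShape = _Const(shapeData, {2}, NCHW, halide_type_of<int>());
    // Legacy form: two values populate common->padX / common->padY.
    auto y2 = _Col2Im(x, outputShape, {3, 3}, {1, 1}, {1, 1}, {1, 1});
    // New form: four values are copied straight into common->pads.
    auto y4 = _Col2Im(x, outputShape, {3, 3}, {1, 1}, {1, 1, 1, 1}, {1, 1});
    return 0;
}
```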
9 changes: 6 additions & 3 deletions express/module/PipelineModule.cpp
@@ -713,6 +713,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, std::shared_ptr<BufferStorage> bufferStorage, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config* config, std::map<std::string, SubGraph>& subGraphMap) {

Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, std::shared_ptr<BufferStorage> bufferStorage, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config* config, std::map<std::string, SubGraph>& subGraphMap) {
MNN_ASSERT(nullptr != rtMgr);
+MNN_ASSERT(nullptr != config);
std::shared_ptr<Schedule::ScheduleInfo> sharedConst;
auto buffer = bufferStorage->buffer();
auto length = bufferStorage->size();
@@ -721,12 +722,14 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
// Extra Const Tensors
sharedConst.reset(new Schedule::ScheduleInfo);
auto curExe = ExecutorScope::Current();
-bool permitCodeGen = false;
+bool preReplaceConstTensor = true;
std::shared_ptr<ModuleRuntimeConfig> modRuntimeCfgPtr(new ModuleRuntimeConfig);
if (!rtMgr->getInside()->mContent->mExternalFile.empty()) {
modRuntimeCfgPtr->externalFile = rtMgr->getInside()->mContent->mExternalFile;
}
-permitCodeGen = rtMgr->getInside()->mContent->modes.codegenMode == Interpreter::Session_Codegen_Enable;
+if (rtMgr->getInside()->mContent->modes.codegenMode == Interpreter::Session_Codegen_Enable || (!config->shapeMutable)) {
+preReplaceConstTensor = false;
+}
std::shared_ptr<Backend> defaultBackend = curExe->getAttr()->constantBackend;
std::vector<std::shared_ptr<Tensor>> allTensors;
sharedConst->allTensors.resize(net->tensorName()->size());
@@ -795,7 +798,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
for (int i=0; i<subModulesInfo.size(); ++i) {
subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, sharedConst, *config, modRuntime));
}
-if (!permitCodeGen) {
+if (preReplaceConstTensor) {
// Prereplace const tensor
auto curBackend = sharedConst->constReplaceBackend.get();
if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) {
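With the added `MNN_ASSERT(nullptr != config)`, this load path now requires an explicit `Module::Config`, and `config->shapeMutable` joins the codegen mode in deciding whether constant tensors are pre-replaced. A hedged caller sketch against the public `Module::load` overload follows; everything beyond "pass a non-null config" is illustrative.

```cpp
// Hedged sketch: always hand Module::load an explicit config, since
// PipelineModule::load now asserts that config is non-null.
#include <memory>
#include <string>
#include <vector>
#include <MNN/expr/Module.hpp>

using MNN::Express::Module;

std::shared_ptr<Module> loadNet(const std::string& path) {
    Module::Config cfg;
    cfg.shapeMutable = true; // false would also disable const pre-replacement here
    std::vector<std::string> inputs{"input"};   // hypothetical tensor names
    std::vector<std::string> outputs{"output"};
    return std::shared_ptr<Module>(
        Module::load(inputs, outputs, path.c_str(), &cfg));
}
```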
2 changes: 1 addition & 1 deletion include/MNN/MNNDefine.h
@@ -76,6 +76,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 3
#define MNN_VERSION_MINOR 0
-#define MNN_VERSION_PATCH 4
+#define MNN_VERSION_PATCH 5
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
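Since the only change here is the patch-level bump from 3.0.4 to 3.0.5, downstream code can gate on the version macros. A small hypothetical guard (the macro names are real, the policy is illustrative):

```cpp
// Hypothetical compile-time guard for features that arrived with 3.0.5.
#include <MNN/MNNDefine.h>

#if (MNN_VERSION_MAJOR < 3) || \
    (MNN_VERSION_MAJOR == 3 && MNN_VERSION_MINOR == 0 && MNN_VERSION_PATCH < 5)
#error "This translation unit expects MNN >= 3.0.5"
#endif
```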
3 changes: 0 additions & 3 deletions include/MNN/expr/MathOp.hpp
@@ -138,9 +138,6 @@ MNN_PUBLIC VARP _CumSum(VARP x, int axis, bool exclusive = false, bool reverse =
MNN_PUBLIC VARP _CumProd(VARP x, int axis);
MNN_PUBLIC VARPS _Svd(VARP x);
MNN_PUBLIC VARP _Histogram(VARP x, int bin, int min, int max, int channel = -1);
-#ifdef MNN_BUILD_AUDIO
-MNN_PUBLIC VARP _Stft(VARP sample, VARP window, int n_fft, int hop_length, bool abse = true);
-#endif
}; // namespace Express
}; // namespace MNN

73 changes: 65 additions & 8 deletions source/backend/arm82/Arm82Functions.cpp
@@ -25,6 +25,7 @@ using Vec = MNN::Math::Vec<FLOAT16, 8>;
extern "C" {
// (UP_DIV(l,8), e, 8) -> (UP_DIV(e,eP), l, eP)
void Arm82MNNPackForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el);
+// void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size_t depth, int32_t* areaOffset);

// C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, eP), hP = 24
// parameter: [aStride, l, h, cStride, bExtraStride]
@@ -372,14 +373,44 @@ void MNNUnpackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, si
int c = (int)depth;
int cDiv4 = c / 8;
int cAlign = cDiv4 * 8;
+int areaDiv4 = area / 4;
+int areaAlign = areaDiv4 * 4;

-for (int hi = 0; hi < area; ++hi) {
-auto srcHeight = src + hi * 8;
-auto dstHeight = dst + hi * c;
+if (areaAlign > 0) {
for (int ci = 0; ci < cDiv4; ++ci) {
-vst1q_s16(dstHeight + ci * 8, vld1q_s16(srcHeight + 8 * ci * srcAreaOffset));
+auto srcH = src + ci * 8 * srcAreaOffset;
+auto dstH = dst + ci * 8;
+for (int hi = 0; hi < areaAlign; hi+=4) {
+auto src0 = srcH + hi * 8;
+auto src1 = srcH + hi * 8 + 8;
+auto src2 = srcH + hi * 8 + 16;
+auto src3 = srcH + hi * 8 + 24;
+
+auto dst0 = dstH + hi * c;
+auto dst1 = dstH + hi * c + c;
+auto dst2 = dstH + hi * c + 2 * c;
+auto dst3 = dstH + hi * c + 3 * c;
+vst1q_s16(dst0, vld1q_s16(src0));
+vst1q_s16(dst1, vld1q_s16(src1));
+vst1q_s16(dst2, vld1q_s16(src2));
+vst1q_s16(dst3, vld1q_s16(src3));
+}
}
}
+if (areaAlign < area) {
+for (int ci = 0; ci < cDiv4; ++ci) {
+auto srcH = src + 8 * ci * srcAreaOffset;
+auto dstH = dst + ci * 8;
+for (int hi = areaAlign; hi < area; ++hi) {
+auto src0 = srcH + hi * 8;
+auto dst0 = dstH + hi * c;
+vst1q_s16(dst0, vld1q_s16(src0));
+}
+}
+}
if (c == cAlign) {
return;
}

int cReamin = c - cAlign;
auto srcAlign = src + srcAreaOffset * cAlign;
@@ -404,11 +435,37 @@ void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size
int c = (int)depth;
int cDiv4 = c / 8;
int cAlign = cDiv4 * 8;
-for (int hi = 0; hi < area; ++hi) {
-auto srcHeight = (src + hi * c);
-auto dstHeight = (dst + hi * 8);
+int areaDiv4 = area / 4;
+int areaAlign = areaDiv4 * 4;
+if (areaAlign > 0) {
for (int ci = 0; ci < cDiv4; ++ci) {
-vst1q_s16(dstHeight + ci * dstAreaOffset * 8, vld1q_s16(srcHeight + 8 * ci));
+auto srcH = src + ci * 8;
+auto dstH = dst + ci * dstAreaOffset * 8;
+for (int hi = 0; hi < areaAlign; hi+=4) {
+auto src0 = srcH + hi * c;
+auto src1 = srcH + hi * c + c;
+auto src2 = srcH + hi * c + 2 * c;
+auto src3 = srcH + hi * c + 3 * c;
+auto dst0 = dstH + hi * 8;
+auto dst1 = dstH + hi * 8 + 8;
+auto dst2 = dstH + hi * 8 + 16;
+auto dst3 = dstH + hi * 8 + 24;
+vst1q_s16(dst0, vld1q_s16(src0));
+vst1q_s16(dst1, vld1q_s16(src1));
+vst1q_s16(dst2, vld1q_s16(src2));
+vst1q_s16(dst3, vld1q_s16(src3));
+}
}
}
+if (areaAlign < area) {
+for (int ci = 0; ci < cDiv4; ++ci) {
+auto srcH = src + ci * 8;
+auto dstH = dst + ci * dstAreaOffset * 8;
+for (int hi = areaAlign; hi < area; ++hi) {
+auto src0 = srcH + hi * c;
+auto dst0 = dstH + hi * 8;
+vst1q_s16(dst0, vld1q_s16(src0));
+}
+}
+}

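Both rewrites preserve the original index math and only add a 4-row inner blocking so that four `vst1q_s16` stores issue per loop iteration. As a reading aid, here is a scalar sketch (a simplification under stated assumptions, not code from this commit) of what `MNNUnpackTransposeInt16C8` computes for full 8-channel groups; the real function also handles the `depth % 8` tail and takes its offsets via an `areaOffset` array rather than the flattened `srcAreaOffset` parameter used here.

```cpp
#include <stdint.h>
#include <stddef.h>

// Scalar reference: unpack (UP_DIV(depth,8), area, 8) C8 storage into a
// plain (area, depth) layout. Only full channel groups are handled; the
// depth % 8 remainder is omitted for brevity.
void UnpackTransposeInt16C8Ref(int16_t* dst, const int16_t* src,
                               size_t area, size_t depth, int srcAreaOffset) {
    int c = (int)depth;
    int cDiv8 = c / 8;
    for (int hi = 0; hi < (int)area; ++hi) {
        for (int ci = 0; ci < cDiv8; ++ci) {
            for (int k = 0; k < 8; ++k) {
                // Mirrors vst1q_s16(dst + hi*c + ci*8,
                //                   vld1q_s16(src + 8*ci*srcAreaOffset + hi*8))
                dst[hi * c + ci * 8 + k] = src[ci * 8 * srcAreaOffset + hi * 8 + k];
            }
        }
    }
}
```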