diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85abc8a4d3b4..c6906e832e39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,11 +80,11 @@ tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library gra
 # include directories
 include_directories(${CMAKE_INCLUDE_PATH})
 include_directories("include")
-include_directories(${DLPACK_PATH})
-include_directories(${DMLC_PATH})
-include_directories(${RANG_PATH})
-include_directories(${COMPILER_RT_PATH})
-include_directories(${PICOJSON_PATH})
+include_directories(SYSTEM ${DLPACK_PATH})
+include_directories(SYSTEM ${DMLC_PATH})
+include_directories(SYSTEM ${RANG_PATH})
+include_directories(SYSTEM ${COMPILER_RT_PATH})
+include_directories(SYSTEM ${PICOJSON_PATH})
 
 # initial variables
 set(TVM_LINKER_LIBS "")
@@ -123,14 +123,15 @@ if(MSVC)
   # DLL interface warning in c++
   add_compile_options(/wd4251)
 else(MSVC)
+  set(WARNING_FLAG -Wall)
   if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
     message(STATUS "Build in Debug mode")
-    set(CMAKE_C_FLAGS "-O0 -g -Wall -fPIC ${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "-O0 -g -Wall -fPIC ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "-O0 -g ${WARNING_FLAG} -fPIC ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-O0 -g ${WARNING_FLAG} -fPIC ${CMAKE_CXX_FLAGS}")
     set(CMAKE_CUDA_FLAGS "-O0 -g -Xcompiler=-Wall -Xcompiler=-fPIC ${CMAKE_CUDA_FLAGS}")
   else()
-    set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_C_FLAGS "-O2 ${WARNING_FLAG} -fPIC ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-O2 ${WARNING_FLAG} -fPIC ${CMAKE_CXX_FLAGS}")
     set(CMAKE_CUDA_FLAGS "-O2 -Xcompiler=-Wall -Xcompiler=-fPIC ${CMAKE_CUDA_FLAGS}")
     set(TVM_VISIBILITY_FLAG "")
     if (HIDE_PRIVATE_SYMBOLS)
@@ -142,6 +143,7 @@ else(MSVC)
       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
   endif()
+  include(cmake/modules/ClangFlags.cmake)
 
   # Detect if we're compiling for Hexagon.
   set(TEST_FOR_HEXAGON_CXX
@@ -365,6 +367,13 @@ set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAGS}")
 add_library(tvm_runtime SHARED $<TARGET_OBJECTS:tvm_runtime_objs>)
 set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAGS}")
 
+if(USE_MICRO)
+  # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the
+  # Unix Makefiles generator, we need to add these explicit target-level dependencies.
+  add_dependencies(tvm host_standalone_crt)
+  add_dependencies(tvm_runtime host_standalone_crt)
+endif()
+
 if(USE_CPP_RPC)
   add_subdirectory("apps/cpp_rpc")
 endif()
@@ -390,7 +399,7 @@ if(BUILD_FOR_HEXAGON)
   set_target_properties(tvm_runtime PROPERTIES LINK_FLAGS
                         "-Wl,--wrap=pthread_create")
 
-  target_include_directories(tvm_runtime
+  target_include_directories(tvm_runtime SYSTEM
     PUBLIC "${USE_HEXAGON_SDK}/libs/common/qurt/ADSPv62MP/include/posix"
     PUBLIC "${USE_HEXAGON_SDK}/libs/common/qurt/ADSPv62MP/include/qurt"
     PUBLIC "${USE_HEXAGON_SDK}/incs"
@@ -453,7 +462,7 @@ if(GTEST_INCLUDE_DIR AND GTEST_LIB)
     string(REPLACE ".cc" "" __execname ${__srcname})
     add_executable(${__execname} ${__srcpath})
     list(APPEND TEST_EXECS ${__execname})
-    target_include_directories(${__execname} PUBLIC ${GTEST_INCLUDE_DIR})
+    target_include_directories(${__execname} SYSTEM PUBLIC ${GTEST_INCLUDE_DIR})
     target_link_libraries(${__execname} ${TVM_TEST_LIBRARY_NAME} ${GTEST_LIB} pthread dl)
     set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_ALL 1)
     set_target_properties(${__execname} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1)
diff --git a/Jenkinsfile b/Jenkinsfile
index f6faf2542209..9e3b0f5415eb 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -92,6 +92,19 @@ def init_git_win() {
   }
 }
 
+def cancel_previous_build() {
+  // cancel previous build if it is not on master.
+  if (env.BRANCH_NAME != "master") {
+    def buildNumber = env.BUILD_NUMBER as int
+    // Milestone API allows us to cancel previous build
+    // with the same milestone number
+    if (buildNumber > 1) milestone(buildNumber - 1)
+    milestone(buildNumber)
+  }
+}
+
+cancel_previous_build()
+
 stage("Sanity Check") {
   timeout(time: max_time, unit: 'MINUTES') {
     node('CPU') {
@@ -112,7 +125,11 @@ def make(docker_type, path, make_flag) {
       sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}"
       // always run cpp test when build
       sh "${docker_run} ${docker_type} ./tests/scripts/task_cpp_unittest.sh"
-    } catch (exc) {
+    } catch (hudson.AbortException ae) {
+      // script exited due to user abort, directly throw instead of retry
+      if (ae.getMessage().contains('script returned exit code 143')) {
+        throw ae
+      }
       echo 'Incremental compilation failed. Fall back to build from scratch'
       sh "${docker_run} ${docker_type} ./tests/scripts/task_clean.sh ${path}"
       sh "${docker_run} ${docker_type} ./tests/scripts/task_build.sh ${path} ${make_flag}"
diff --git a/KEYS b/KEYS
index b338c3335f40..ba657b4eb2c7 100644
--- a/KEYS
+++ b/KEYS
@@ -186,3 +186,61 @@ N8yATW8CRIKO3IobUfLMDdPCLO7uzoW95cI35Y0l8JgK2NeU6tVZptP5mDogeAbq
 8PlimrXuzG9Bokct2SOO6Z51i6rSDo/ALj440EvWNw==
 =1xfH
 -----END PGP PUBLIC KEY BLOCK-----
+pub   rsa4096 2020-09-24 [SC]
+      6A0D4938D8C052C759AE2460ED03B26E4FC3509F
+uid           [ultimate] Ziheng Jiang <ziheng@apache.org>
+sig 3         ED03B26E4FC3509F 2020-09-24  Ziheng Jiang <ziheng@apache.org>
+sub   rsa4096 2020-09-24 [E]
+sig           ED03B26E4FC3509F 2020-09-24  Ziheng Jiang <ziheng@apache.org>
+
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQINBF9tEiUBEAC90om00alNSupM78ZZYMdwKZnJLIhAD22YARntVNVBuD9Znpuo
+BAYwjrWdAi/npwN+r+Pd7Oz6fMBCmB3e4tsrPnBzauGb6aKgjBMHcVEx0p1197kk
+WcGuKt4FNlHYfmc2sOOQre2GcIVOU1XuK8tAhgca78aorAlMtOqq+/ASnKcjRSjW
+0AOzlEKfaGVgst2UO7Fc/w59S3/qv1vBGKnlqLvsJU7kNR1gFotqsGAee5Vu7alQ
+WiHFJbW9ujLTPu7m8enFVuBGFkPsW89Yl/0mXnAKZFFNCHIQ9gkT+1bvZhx8ViJL
+4UeqG7wnSLSSIQz2UPBJYV5stxNtd9HS08Tfviv37shd1SSprFLoQDk87j7wF60b
+AR5IjbVgdprpmVNncO5pnyZwXXWVi7ZyiMSaW6wg+lkeQMGflxgL+05xOafJYgO6
+UepXqu1mc7Q4eVUyft/EPmdyvlg7Fo4T4Db2PnstonkZCyLogdaaJRuxCc0AR/O1
+oNaodrdjqydXVnP3d/gJ5gj78zeMPVbGbzwhpIwhfDouxftaU5zc6prBsMgY/os6
+XMe9bNZWpOLXZrmo/ovaiebmxT5ZYuFRGdeRl1/Y5CWE6Q9JM8euwKuskNQ4G0aY
+fVQ61Cxg4hmrnsv9YFAjf9PPWhpFHvILQoGSs3HbJCLFPphKf37gzfiZMQARAQAB
+tCBaaWhlbmcgSmlhbmcgPHppaGVuZ0BhcGFjaGUub3JnPokCTgQTAQgAOBYhBGoN
+STjYwFLHWa4kYO0Dsm5Pw1CfBQJfbRIlAhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4B
+AheAAAoJEO0Dsm5Pw1CfBl0P/R04MxtqC4aI0fpdwmed55kGunL1W65phBgcOrDL
+58cv5dKJzUmfSUXw3QANcFSn9Q9Z2clj+a2aiGKV5cUiWN0Ny7y6wd3aVOXlRHHy
+f30aDO5Ug5RDYbcChTpen+kq9qDXSr/NxXYLWvhobMeXfiA9Priv49fFWEr17Kai
+NOuoix/eWA5WpnPMf/Rz4HibKcX/izXTW0NOH54jn+4P9M4ZwWbn0AXKoq2i3zF2
+vZavCStcscrfs+kihtEVvwUkyrmSIblIUdkPNxeo/jx7N9Fbu2zXbhl5JiBmBUMJ
+XyFUOBSUDUzA5EvWwXp0yatULOoCH/LIyt+lLdkyfDjsKmAavGf9CcFHVyDIG95N
+34/jECPwwVVkbauE0XYOwenOh+Be1goOA6nidB4QT/rGns7zvCNG8+3ttwA4aiE5
+3GrVWXiPaEMaoM56Phscek30GoLjB2gjvwgwa9oGTDYTu8Z4ifLk8qq8ij9uEG7V
+cKns+1C3ZvfdKi8SmOzj/v9krOi8N4YW03YS2Oq/cGPD/SttoMTOCPxi0PLR7uqy
+YXugsebxlJlXBNTeZx+iiKmkrsILjEd8pUChw79crtH2SGOPqIv1BsqObstLV04r
+iiywruqLRIGlsr8BtepCeEfzW9nJRw7W2571t7oD7QbkdCJ4WUyhMJH73+7KFEE2
+fKL5uQINBF9tEiUBEACmMcP8/zm88BmyhDjWV3ZrZ9cn0N3JJfSONt0AcyE5TZ2y
+20DnHkp3/lNK6EC0k7twtcce/cnKDbXQ/IpuJZwReq5SgmCoGbBZShjALtVCzQRm
+pSA6Wl0JBfw36/IdKUuf8LZtENqp3jgQkkT3TA+/bCh1KQLDYFoVQjUBLiWCDHiL
+iBV5L/PH97l93hkxbSDXrBemQRbr+xhA2TzwcmrjnscNCAXkwU9f1Ygh8zDHSJKB
+g7Ln+ot6QsPhNQEQWhju5xfAn9+kO8OWSAZF/lJTT2Wy+spDBP1ZnviQadWPj5HL
+n4G1qe4QWl08E9FtqVKC7r1YYzT4DlTU2AQ0bJqdvAtojX9ji2Hp4ov8xYPHzy3a
+ZRdDYNWN6i0mbpzj8SYojyEG5cy2j+nzGOYTEdpwW8pG2aCwRvnO+UqXNM3UyQk3
+9Tyfyzw6m9mlq9zaw/nfvOIA6Ns2QR5+UbplkpwVMqMAzZNyEV2wPe9B195MN6tq
+KcznzawD/W1ORccOxrpBXhN3sJSc5n8Uy5pHUHg9B/TdCSLpr7tqqS34gB+AcSUL
+NxjdLn72JHKxCp/wpg3Z4bmY5n/bh/D7Ovt7LP1D/MW9wiR3ls/PtNAK4+SV6oqt
+G1MNS0QgAitovF8dpmX+/zPKax7baZiJY/sDr9crfRvd6e+HYA3yDo08Z44MTwAR
+AQABiQI2BBgBCAAgFiEEag1JONjAUsdZriRg7QOybk/DUJ8FAl9tEiUCGwwACgkQ
+7QOybk/DUJ9Txw/+NXL6cKEIm4NQrBc0RmX37sELc5UnvpycV663OiPF9qHE9iML
+EUt/LBxrGUarplOA66EIkmmnekUgS8ujjhGOw152nSuZTgoPxX4ub6PI7Hi5lmqI
+ZtEpp8VoI+XxAdA5ecN5QNP7P/ovSIZwXvIF00YXqGp6keXi/qdYkylt4s6zLDiL
+ocfOZWt994JVIl30gogkw4PmcWx+PKXos+Hq1La7iZUn1pT5kEsN+fHpnh42sAGZ
+dhb+puB5tczhVJhL553Z6rh4BABd1DqAZihwelkRRvQUp0Fqgc2oxty5o5pdHZdS
+ulomOqGgERHsrtwzqD/n3iep3z22LiitZHsKZ0OoHl9e1YsvdsL5rImEz/FdWwgl
+muO2ZjY2KhuovFROCsGVgw3b9gzIjtE8FWE6wSz6qzKihBbI8YPtQqGJgnX2A01m
+AkfpGPb8430OghDCQFsrWkuTjmSw42ys1lALbK2yQGRuOCq0dIml1QdE6JfU8ceW
+QY1dhH7xpHQxlr9Tcv+enCc4UzCJOnXgkVUnD/u+TqKL9GoSFu6KQrC7jyvfY9t8
+Elf2ReXYVK/jGUePdDFurp+3KFlAHFuen2VZTcNZaUWoYoI84VDEh8/oPEPfzveJ
+/GhL5vbglB0H0aG8SVMaTfzr+nXHUVyOSrlYYk34O7bSimVrX6XDGPZpsXE=
+=nhJ/
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/README.md b/README.md
index 3f5b172f6268..f0e011bfc616 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@
 [Community](https://tvm.apache.org/community) |
 [Release Notes](NEWS.md)
 
-[![Build Status](https://ci.tvm.ai/buildStatus/icon?job=tvm/master)](https://ci.tvm.ai/job/tvm/job/master/)
+[![Build Status](https://ci.tlcpack.ai/buildStatus/icon?job=tvm/master)](https://ci.tlcpack.ai/job/tvm/job/master/)
 [![WinMacBuild](https://github.com/apache/incubator-tvm/workflows/WinMacBuild/badge.svg)](https://github.com/apache/incubator-tvm/actions?query=workflow%3AWinMacBuild)
 
 Apache TVM (incubating) is a compiler stack for deep learning systems. It is designed to close the gap between the
diff --git a/apps/tf_tvmdsoop/CMakeLists.txt b/apps/tf_tvmdsoop/CMakeLists.txt
index f4e83c528701..9be5da9d26de 100644
--- a/apps/tf_tvmdsoop/CMakeLists.txt
+++ b/apps/tf_tvmdsoop/CMakeLists.txt
@@ -22,8 +22,8 @@ set(BUILD_TVMDSOOP_ONLY ON)
 set(CMAKE_CURRENT_SOURCE_DIR ${TVM_ROOT})
 set(CMAKE_CURRENT_BINARY_DIR ${TVM_ROOT}/build)
 
-include_directories(${TVM_ROOT}/3rdparty/dlpack/include/)
-include_directories(${TVM_ROOT}/3rdparty/dmlc-core/include/)
+include_directories(SYSTEM ${TVM_ROOT}/3rdparty/dlpack/include/)
+include_directories(SYSTEM ${TVM_ROOT}/3rdparty/dmlc-core/include/)
 include_directories(${TVM_ROOT}/include)
 
 link_directories(${TVM_ROOT}/build)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index e7e2a5a8a5c9..6ed660c86964 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -87,17 +87,6 @@ set(USE_OPENGL OFF)
 # Whether enable MicroTVM runtime
 set(USE_MICRO OFF)
 
-# Whether to enable SGX runtime
-#
-# Possible values for USE_SGX:
-# - /path/to/sgxsdk: path to Intel SGX SDK
-# - OFF: disable SGX
-#
-# SGX_MODE := HW|SIM
-set(USE_SGX OFF)
-set(SGX_MODE "SIM")
-set(RUST_SGX_SDK "/path/to/rust-sgx-sdk")
-
 # Whether enable RPC runtime
 set(USE_RPC ON)
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 936bb681b7ff..2583e8f3c9ca 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -21,7 +21,7 @@ find_cuda(${USE_CUDA})
 if(CUDA_FOUND)
   # always set the includedir when cuda is available
   # avoid global retrigger of cmake
-  include_directories(${CUDA_INCLUDE_DIRS})
+  include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
 endif(CUDA_FOUND)
 
 if(USE_CUDA)
diff --git a/cmake/modules/ClangFlags.cmake b/cmake/modules/ClangFlags.cmake
new file mode 100644
index 000000000000..9a3ac05a2a5b
--- /dev/null
+++ b/cmake/modules/ClangFlags.cmake
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# If we are running clang >= 10.0 then enable more checking. Some of these warnings may not exist
+# in older versions of clang so we limit the use of older clang for these checks.
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_full_version)
+  string (REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION ${clang_full_version})
+  message(STATUS "CLANG_VERSION ${CLANG_VERSION}")
+  if (CLANG_VERSION VERSION_GREATER_EQUAL 10.0)
+    message(STATUS "Setting enhanced clang warning flags")
+
+    # These warnings are only enabled when clang's -Weverything flag is enabled
+    # but there is no harm in turning them off for all cases.
+    add_compile_options(
+      -Wno-c++98-compat
+      -Wno-c++98-compat-extra-semi
+      -Wno-c++98-compat-pedantic
+      -Wno-padded
+      -Wno-extra-semi
+      -Wno-extra-semi-stmt
+      -Wno-unused-parameter
+      -Wno-sign-conversion
+      -Wno-weak-vtables
+      -Wno-deprecated-copy-dtor
+      -Wno-global-constructors
+      -Wno-double-promotion
+      -Wno-float-equal
+      -Wno-missing-prototypes
+      -Wno-implicit-int-float-conversion
+      -Wno-implicit-float-conversion
+      -Wno-implicit-int-conversion
+      -Wno-float-conversion
+      -Wno-shorten-64-to-32
+      -Wno-covered-switch-default
+      -Wno-unused-exception-parameter
+      -Wno-return-std-move-in-c++11
+      -Wno-over-aligned
+      -Wno-undef
+      -Wno-inconsistent-missing-destructor-override
+      -Wno-unreachable-code
+      -Wno-deprecated-copy
+      -Wno-implicit-fallthrough
+      -Wno-unreachable-code-return
+      -Wno-non-virtual-dtor
+    )
+
+    # Here we have non-standard warnings that clang has available and are useful
+    # so enable them if we are using clang.
+    add_compile_options(
+      -Wreserved-id-macro
+      -Wused-but-marked-unused
+      -Wdocumentation-unknown-command
+      -Wcast-qual
+      -Wzero-as-null-pointer-constant
+
+      # These warnings should be enabled one at a time and fixed.
+      # To enable one of these warnings remove the `no-` after -W so
+      # -Wno-documentation -> -Wdocumentation
+      -Wno-documentation
+      -Wno-shadow-uncaptured-local
+      -Wno-shadow-field-in-constructor
+      -Wno-shadow
+      -Wno-shadow-field
+      -Wno-exit-time-destructors
+      -Wno-switch-enum
+      -Wno-old-style-cast
+      -Wno-gnu-anonymous-struct
+      -Wno-nested-anon-types
+    )
+
+  endif ()
+endif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 30b4ccbc5618..9fc806cf9047 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -76,7 +76,7 @@ if(USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}")
   find_hexagon_toolchain()
   message(STATUS "Hexagon toolchain: ${HEXAGON_TOOLCHAIN}")
   file(GLOB RUNTIME_HEXAGON_SIM_SRCS src/runtime/hexagon/sim/*.cc)
-  include_directories("${HEXAGON_TOOLCHAIN}/include/iss")
+  include_directories(SYSTEM "${HEXAGON_TOOLCHAIN}/include/iss")
   link_directories("${HEXAGON_TOOLCHAIN}/lib/iss")
   list(APPEND TVM_RUNTIME_LINKER_LIBS "-lwrapper")
   ExternalProject_Add(sim_dev
@@ -91,11 +91,11 @@ elseif(USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}")
   find_hexagon_toolchain()
   message(STATUS "Hexagon SDK: ${HEXAGON_SDK_ROOT}")
   file(GLOB RUNTIME_HEXAGON_DEVICE_SRCS src/runtime/hexagon/target/*.cc)
-  include_directories("${HEXAGON_SDK_ROOT}/incs/stddef")
-  include_directories("${HEXAGON_SDK_ROOT}/libs/common/rpcmem/inc")
+  include_directories(SYSTEM "${HEXAGON_SDK_ROOT}/incs/stddef")
+  include_directories(SYSTEM "${HEXAGON_SDK_ROOT}/libs/common/rpcmem/inc")
   include_directories(
-      "${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64")
-  include_directories("${HEXAGON_TOOLCHAIN}/include/iss")
+      SYSTEM "${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64")
+  include_directories(SYSTEM "${HEXAGON_TOOLCHAIN}/include/iss")
   list(APPEND TVM_RUNTIME_LINKER_LIBS "dl")
   if(BUILD_FOR_ANDROID)
     # Hexagon runtime uses __android_log_print, which is in liblog.
diff --git a/cmake/modules/LLVM.cmake b/cmake/modules/LLVM.cmake
index be6d0fb27242..5f8ace17111f 100644
--- a/cmake/modules/LLVM.cmake
+++ b/cmake/modules/LLVM.cmake
@@ -18,9 +18,11 @@
 # LLVM rules
 add_definitions(-DDMLC_USE_FOPEN64=0)
 
-if(NOT USE_LLVM STREQUAL "OFF")
+# Test if ${USE_LLVM} is not an explicit boolean false
+# It may be a boolean or a string
+if(NOT ${USE_LLVM} MATCHES ${IS_FALSE_PATTERN})
   find_llvm(${USE_LLVM})
-  include_directories(${LLVM_INCLUDE_DIRS})
+  include_directories(SYSTEM ${LLVM_INCLUDE_DIRS})
   add_definitions(${LLVM_DEFINITIONS})
   message(STATUS "Build with LLVM " ${LLVM_PACKAGE_VERSION})
   message(STATUS "Set TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
index 64ce3c3d3810..a82c080c4199 100644
--- a/cmake/modules/OpenCL.cmake
+++ b/cmake/modules/OpenCL.cmake
@@ -21,7 +21,7 @@ find_opencl(${USE_OPENCL})
 if(OpenCL_FOUND)
   # always set the includedir when cuda is available
   # avoid global retrigger of cmake
-  include_directories(${OpenCL_INCLUDE_DIRS})
+  include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS})
 endif(OpenCL_FOUND)
 
 if(USE_SDACCEL)
diff --git a/cmake/modules/ROCM.cmake b/cmake/modules/ROCM.cmake
index a28b37ebe465..ec348f8b57f6 100644
--- a/cmake/modules/ROCM.cmake
+++ b/cmake/modules/ROCM.cmake
@@ -21,7 +21,7 @@ find_rocm(${USE_ROCM})
 if(ROCM_FOUND)
   # always set the includedir
   # avoid global retrigger of cmake
-  include_directories(${ROCM_INCLUDE_DIRS})
+  include_directories(SYSTEM ${ROCM_INCLUDE_DIRS})
   add_definitions(-D__HIP_PLATFORM_HCC__=1)
 endif(ROCM_FOUND)
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index fd3a121ef795..a9fc66507d35 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -59,7 +59,7 @@ elseif(PYTHON)
     list(APPEND FSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc)
     # Target lib: vta_fsim
     add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS})
-    target_include_directories(vta_fsim PUBLIC ${VTA_HW_PATH}/include)
+    target_include_directories(vta_fsim SYSTEM PUBLIC ${VTA_HW_PATH}/include)
     foreach(__def ${VTA_DEFINITIONS})
       string(SUBSTRING ${__def} 3 -1 __strip_def)
       target_compile_definitions(vta_fsim PUBLIC ${__strip_def})
@@ -80,7 +80,7 @@ elseif(PYTHON)
     list(APPEND TSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc)
     # Target lib: vta_tsim
     add_library(vta_tsim SHARED ${TSIM_RUNTIME_SRCS})
-    target_include_directories(vta_tsim PUBLIC ${VTA_HW_PATH}/include)
+    target_include_directories(vta_tsim SYSTEM PUBLIC ${VTA_HW_PATH}/include)
     foreach(__def ${VTA_DEFINITIONS})
       string(SUBSTRING ${__def} 3 -1 __strip_def)
       target_compile_definitions(vta_tsim PUBLIC ${__strip_def})
@@ -116,9 +116,9 @@ elseif(PYTHON)
       target_link_libraries(vta ${__cma_lib})
     elseif(${VTA_TARGET} STREQUAL "de10nano")  # DE10-Nano rules
       #target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21)
-      target_include_directories(vta PUBLIC ${VTA_HW_PATH}/src/de10nano)
-      target_include_directories(vta PUBLIC 3rdparty)
-      target_include_directories(vta PUBLIC
+      target_include_directories(vta SYSTEM PUBLIC ${VTA_HW_PATH}/src/de10nano)
+      target_include_directories(vta SYSTEM PUBLIC 3rdparty)
+      target_include_directories(vta SYSTEM PUBLIC
         "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include")
     endif()
   endif()
diff --git a/cmake/modules/Vulkan.cmake b/cmake/modules/Vulkan.cmake
index 29fcfe946361..4df8986c800c 100644
--- a/cmake/modules/Vulkan.cmake
+++ b/cmake/modules/Vulkan.cmake
@@ -29,7 +29,7 @@ tvm_option(USE_VULKAN_VALIDATION "Enable Vulkan API validation layers" OFF
 if(Vulkan_FOUND)
   # always set the includedir
   # avoid global retrigger of cmake
-  include_directories(${Vulkan_INCLUDE_DIRS})
+  include_directories(SYSTEM ${Vulkan_INCLUDE_DIRS})
 endif(Vulkan_FOUND)
 
 if(USE_VULKAN)
diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake
index e8c8e22e3334..06c8755882d5 100644
--- a/cmake/modules/contrib/BLAS.cmake
+++ b/cmake/modules/contrib/BLAS.cmake
@@ -27,7 +27,7 @@ elseif(USE_BLAS STREQUAL "atlas" OR USE_BLAS STREQUAL "blas")
   message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
 elseif(USE_BLAS STREQUAL "apple")
   find_library(BLAS_LIBRARY Accelerate)
-  include_directories(${BLAS_LIBRARY}/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/)
+  include_directories(SYSTEM ${BLAS_LIBRARY}/Versions/Current/Frameworks/vecLib.framework/Versions/Current/Headers/)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY})
   list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/cblas.cc)
   message(STATUS "Use BLAS library " ${BLAS_LIBRARY})
@@ -57,7 +57,7 @@ if(USE_MKL OR USE_MKL_PATH)
   elseif(MSVC)
     find_library(BLAS_LIBRARY_MKL NAMES mkl_rt HINTS ${USE_MKL}/lib/ ${USE_MKL}/lib/intel64_win)
   endif()
-  include_directories(${USE_MKL}/include)
+  include_directories(SYSTEM ${USE_MKL}/include)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${BLAS_LIBRARY_MKL})
   list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/mkl.cc)
   add_definitions(-DUSE_MKL_BLAS=1)
@@ -69,7 +69,7 @@ if(IS_DIRECTORY ${USE_MKLDNN})
   if (MKLDNN_LIBRARY STREQUAL "MKLDNN_LIBRARY-NOTFOUND")
     message(WARNING "Cannot find MKLDNN library at ${USE_MKLDNN}.")
   else()
-    include_directories(${USE_MKLDNN}/include)
+    include_directories(SYSTEM ${USE_MKLDNN}/include)
     list(APPEND TVM_RUNTIME_LINKER_LIBS ${MKLDNN_LIBRARY})
     list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/mkldnn.cc)
     add_definitions(-DUSE_DNNL=1)
diff --git a/cmake/modules/contrib/EthosN.cmake b/cmake/modules/contrib/EthosN.cmake
index ca1f7daa8c8a..44d2a2a17ace 100644
--- a/cmake/modules/contrib/EthosN.cmake
+++ b/cmake/modules/contrib/EthosN.cmake
@@ -24,7 +24,7 @@ if(NOT USE_ETHOSN STREQUAL "OFF")
     message(FATAL_ERROR "Cannot find Ethos-N, USE_ETHOSN=" ${USE_ETHOSN})
 
   else()
-    include_directories(${ETHOSN_INCLUDE_DIRS})
+    include_directories(SYSTEM ${ETHOSN_INCLUDE_DIRS})
     add_definitions(${ETHOSN_DEFINITIONS})
 
     message(STATUS "Build with Ethos-N ${ETHOSN_PACKAGE_VERSION}")
diff --git a/cmake/util/FindCUDA.cmake b/cmake/util/FindCUDA.cmake
index f971c87f20a8..e6517517a6d0 100644
--- a/cmake/util/FindCUDA.cmake
+++ b/cmake/util/FindCUDA.cmake
@@ -37,7 +37,7 @@
 #
 macro(find_cuda use_cuda)
   set(__use_cuda ${use_cuda})
-  if(__use_cuda STREQUAL "ON")
+  if(${__use_cuda} MATCHES ${IS_TRUE_PATTERN})
     find_package(CUDA QUIET)
   elseif(IS_DIRECTORY ${__use_cuda})
     set(CUDA_TOOLKIT_ROOT_DIR ${__use_cuda})
@@ -91,7 +91,9 @@ macro(find_cuda use_cuda)
     find_library(CUDA_CUBLAS_LIBRARY cublas
                  ${CUDA_TOOLKIT_ROOT_DIR}/lib64
                  ${CUDA_TOOLKIT_ROOT_DIR}/lib)
-    find_library(CUDA_CUBLASLT_LIBRARY cublaslt
+    find_library(CUDA_CUBLASLT_LIBRARY
+      NAMES cublaslt cublasLt
+      PATHS
       ${CUDA_TOOLKIT_ROOT_DIR}/lib64
       ${CUDA_TOOLKIT_ROOT_DIR}/lib)
   endif(MSVC)
diff --git a/cmake/util/FindEthosN.cmake b/cmake/util/FindEthosN.cmake
index 7f0fb6499691..d33b55f0c7a9 100644
--- a/cmake/util/FindEthosN.cmake
+++ b/cmake/util/FindEthosN.cmake
@@ -60,7 +60,7 @@ macro(find_ethosn use_ethosn)
 
     set(ETHOSN_PACKAGE_VERSION "0.1.1")
 
-    if(USE_ETHOSN_HW STREQUAL "ON")
+    if(${USE_ETHOSN_HW} MATCHES ${IS_TRUE_PATTERN})
       # Runtime hardware support
       find_path(_DL_DIR NAMES Network.hpp
                 PATHS ${__ethosn_stack}/include/ethosn_driver_library)
@@ -79,7 +79,7 @@ macro(find_ethosn use_ethosn)
   endif(__ethosn_stack)
 
   if(NOT ETHOSN_FOUND)
-    if(__use_ethosn STREQUAL "ON")
+    if(${__use_ethosn} MATCHES ${IS_TRUE_PATTERN})
       message(WARNING "No cmake find_package available for Arm Ethos-N")
     endif()
diff --git a/cmake/util/FindLLVM.cmake b/cmake/util/FindLLVM.cmake
index d8378803023d..b8c5bf815bf5 100644
--- a/cmake/util/FindLLVM.cmake
+++ b/cmake/util/FindLLVM.cmake
@@ -19,10 +19,13 @@
 # Enhanced version of find llvm.
 #
 # Usage:
-#   find_llvm(${USE_LLVM})
+#   find_llvm(${USE_LLVM} [${LLVM_VERSION}])
 #
 # - When USE_LLVM=ON, use auto search
 # - When USE_LLVM=/path/to/llvm-config, use corresponding config
+# - The optional LLVM_VERSION=<version> which will find a matching version
+#   If LLVM_VERSION=10 then it will find 10.X.X, any version of 10
+#   If LLVM_VERSION=10.2 then it will find 10.2.X
 #
 # Provide variables:
 # - LLVM_INCLUDE_DIRS
@@ -32,19 +35,25 @@
 # - TVM_INFO_LLVM_VERSION
 #
 macro(find_llvm use_llvm)
+  if(${use_llvm} MATCHES ${IS_FALSE_PATTERN})
+    return()
+  endif()
   set(LLVM_CONFIG ${use_llvm})
-  if(LLVM_CONFIG STREQUAL "ON")
-    find_package(LLVM REQUIRED CONFIG)
+  if(${ARGC} EQUAL 2)
+    set(llvm_version_required ${ARGV1})
+  endif()
+
+  if(${LLVM_CONFIG} MATCHES ${IS_TRUE_PATTERN})
+    find_package(LLVM ${llvm_version_required} REQUIRED CONFIG)
     llvm_map_components_to_libnames(LLVM_LIBS "all")
     if (NOT LLVM_LIBS)
       message(STATUS "Not found - LLVM_LIBS")
       message(STATUS "Fall back to using llvm-config")
-      set(LLVM_CONFIG "llvm-config")
-    else()
-      set(LLVM_CONFIG "ON")
+      set(LLVM_CONFIG "${LLVM_TOOLS_BINARY_DIR}/llvm-config")
     endif()
   endif()
-  if(LLVM_CONFIG STREQUAL "ON")
+
+  if(LLVM_LIBS) # check if defined, not if it is true
     list (FIND LLVM_LIBS "LLVM" _llvm_dynlib_index)
     if (${_llvm_dynlib_index} GREATER -1)
       set(LLVM_LIBS LLVM)
@@ -55,7 +64,7 @@ macro(find_llvm use_llvm)
     endif()
     set(TVM_LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
     set(TVM_INFO_LLVM_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
-  elseif(NOT LLVM_CONFIG STREQUAL "OFF")
+  else()
     # use llvm config
     message(STATUS "Use llvm-config=" ${LLVM_CONFIG})
     separate_arguments(LLVM_CONFIG)
@@ -145,13 +154,11 @@ macro(find_llvm use_llvm)
       list(APPEND LLVM_LIBS "${__flag}")
     endforeach()
   endif()
-  if(NOT LLVM_CONFIG STREQUAL "OFF")
-    message(STATUS "Found LLVM_INCLUDE_DIRS=" "${LLVM_INCLUDE_DIRS}")
-    message(STATUS "Found LLVM_DEFINITIONS=" "${LLVM_DEFINITIONS}")
-    message(STATUS "Found LLVM_LIBS=" "${LLVM_LIBS}")
-    message(STATUS "Found TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
-    if (${TVM_LLVM_VERSION} LESS 40)
-      message(FATAL_ERROR "TVM requires LLVM 4.0 or higher.")
-    endif()
+  message(STATUS "Found LLVM_INCLUDE_DIRS=" "${LLVM_INCLUDE_DIRS}")
+  message(STATUS "Found LLVM_DEFINITIONS=" "${LLVM_DEFINITIONS}")
+  message(STATUS "Found LLVM_LIBS=" "${LLVM_LIBS}")
+  message(STATUS "Found TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
+  if (${TVM_LLVM_VERSION} LESS 40)
+    message(FATAL_ERROR "TVM requires LLVM 4.0 or higher.")
   endif()
 endmacro(find_llvm)
diff --git a/cmake/util/FindOpenCL.cmake b/cmake/util/FindOpenCL.cmake
index 2510c01a1bec..9b9f8ec94593 100644
--- a/cmake/util/FindOpenCL.cmake
+++ b/cmake/util/FindOpenCL.cmake
@@ -58,7 +58,7 @@ macro(find_opencl use_opencl)
 
   # No user provided OpenCL include/libs found
   if(NOT OpenCL_FOUND)
-    if(__use_opencl STREQUAL "ON")
+    if(${__use_opencl} MATCHES ${IS_TRUE_PATTERN})
       find_package(OpenCL QUIET)
     endif()
   endif()
diff --git a/cmake/util/FindROCM.cmake b/cmake/util/FindROCM.cmake
index 78b242c8bec3..7d4e282956d9 100644
--- a/cmake/util/FindROCM.cmake
+++ b/cmake/util/FindROCM.cmake
@@ -21,8 +21,8 @@
 # Usage:
 #   find_rocm(${USE_ROCM})
 #
-# - When USE_VULKAN=ON, use auto search
-# - When USE_VULKAN=/path/to/vulkan-sdk-path, use the sdk
+# - When USE_ROCM=ON, use auto search
+# - When USE_ROCM=/path/to/rocm-sdk-path, use the sdk
 #
 # Provide variables:
 #
diff --git a/cmake/util/FindVulkan.cmake b/cmake/util/FindVulkan.cmake
index 00ed7dc61716..feb5eec74d89 100644
--- a/cmake/util/FindVulkan.cmake
+++ b/cmake/util/FindVulkan.cmake
@@ -53,7 +53,7 @@ macro(find_vulkan use_vulkan)
 
   # resort to find vulkan of option is on
   if(NOT Vulkan_FOUND)
-    if(__use_vulkan STREQUAL "ON")
+    if(${__use_vulkan} MATCHES ${IS_TRUE_PATTERN})
      find_package(Vulkan QUIET)
     endif()
   endif()
diff --git a/cmake/util/Util.cmake b/cmake/util/Util.cmake
index d105c82a3082..4e6762b14894 100644
--- a/cmake/util/Util.cmake
+++ b/cmake/util/Util.cmake
@@ -74,3 +74,13 @@ function(assign_source_group group)
     source_group("${group}\\${_source_path_msvc}" FILES "${_source}")
   endforeach()
 endfunction(assign_source_group)
+
+# From cmake documentation:
+# True if the constant is 1, ON, YES, TRUE, Y, or a non-zero number.
+# False if the constant is 0, OFF, NO, FALSE, N, IGNORE, NOTFOUND, the empty string, or ends in the suffix -NOTFOUND.
+# Named boolean constants are case-insensitive.
+#
+# While this regex does contain a check for an empty string, that check does not work;
+# cmake's regex implementation is weak.
+set(IS_FALSE_PATTERN "^[Oo][Ff][Ff]$|^0$|^[Ff][Aa][Ll][Ss][Ee]$|^[Nn][Oo]$|^[Nn][Oo][Tt][Ff][Oo][Uu][Nn][Dd]$|.*-[Nn][Oo][Tt][Ff][Oo][Uu][Nn][Dd]$|^$")
+set(IS_TRUE_PATTERN "^[Oo][Nn]$|^[1-9][0-9]*$|^[Tt][Rr][Uu][Ee]$|^[Yy][Ee][Ss]$|^[Yy]$")
diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 2ad55c0e521e..2b8df74dab7b 100755
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -21,4 +21,4 @@ set -u
 set -o pipefail
 
 # install libraries for python package on ubuntu
-pip3 install pylint==1.9.4 six numpy pytest cython decorator scipy tornado typed_ast pytest mypy orderedset attrs requests Pillow packaging
+pip3 install six numpy pytest cython decorator scipy tornado typed_ast pytest mypy orderedset attrs requests Pillow packaging
diff --git a/docs/api/python/auto_scheduler.rst b/docs/api/python/auto_scheduler.rst
index 85ff22f58b37..c5b8dccf1be2 100644
--- a/docs/api/python/auto_scheduler.rst
+++ b/docs/api/python/auto_scheduler.rst
@@ -18,18 +18,7 @@
 tvm.auto_scheduler
 ------------------
 .. automodule:: tvm.auto_scheduler
-
-tvm.auto_scheduler.auto_schedule
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.auto_scheduler.auto_schedule
-
-.. autoclass:: tvm.auto_scheduler.auto_schedule.SearchTask
-
-.. autoclass:: tvm.auto_scheduler.auto_schedule.TuningOptions
-
-.. autofunction:: tvm.auto_scheduler.auto_schedule.create_task
-
-.. autofunction:: tvm.auto_scheduler.auto_schedule.auto_schedule
-
-
+   :members:
+   :imported-members:
+   :autosummary:
diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst
index e3399c57db26..1ff034a2cd8d 100644
--- a/docs/deploy/arm_compute_lib.rst
+++ b/docs/deploy/arm_compute_lib.rst
@@ -232,6 +232,8 @@ Operator support
 +----------------------+-------------------------------------------------------------------------+
 | reshape              | fp32, uint8                                                             |
 +----------------------+-------------------------------------------------------------------------+
+| maximum              | fp32                                                                    |
++----------------------+-------------------------------------------------------------------------+
 
 .. note::
     A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 775991636167..2bb6e551b1a0 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -173,8 +173,8 @@ Method 1
    This method is **recommended for developers** who may change the codes.
 
    Set the environment variable `PYTHONPATH` to tell python where to find
-   the library. For example, assume we cloned `tvm` on the home directory
-   `~`. then we can added the following line in `~/.bashrc`.
+   the library. For example, assume we cloned `tvm` in the directory
+   `/path/to/tvm`; then we can add the following line to `~/.bashrc`.
    The changes will be immediately reflected once you pull the code and rebuild the project (no need to call ``setup`` again)
 
    .. code:: bash
@@ -242,7 +242,7 @@ tests in TVM. The easiest way to install GTest is from source.
    cd build
    cmake ..
    make
-   make install
+   sudo make install
 
 After installing GTest, the C++ tests can be built and started with ``./tests/scripts/task_cpp_unittest.sh`` or just built with ``make cpptest``.
diff --git a/docs/langref/hybrid_script.rst b/docs/langref/hybrid_script.rst
index 5fae67d59492..984e83c56a8e 100644
--- a/docs/langref/hybrid_script.rst
+++ b/docs/langref/hybrid_script.rst
@@ -38,12 +38,12 @@ you need to use ``tvm.te.hybrid.script`` decorator to indicate this is a hybrid
 .. code-block:: python
 
     @tvm.te.hybrid.script
-    def outer_product(a, b, c):
+    def outer_product(a, b):
+        c = output_tensor((100, 99), 'float32')
         for i in range(a.shape[0]):
             for j in range(b.shape[0]):
                 c[i, j] = a[i] * b[j]
-      return c
+        return c
 
     a = numpy.random.randn(100)
     b = numpy.random.randn(99)
     c = outer_product(a, b)
@@ -76,7 +76,7 @@ or ``tvm.container.Array``, to this function, it returns a op node:
 
     a = tvm.te.placeholder((100, ), name='a')
     b = tvm.te.placeholder((99, ), name='b')
-    c = outer_product(a, b, c) # return the output tensor(s) of the operator
+    c = outer_product(a, b) # return the output tensor(s) of the operator
 
 You can use any methods that can be applied on a TVM ``OpNode``, like create_schedule, although
 so far, the functionality of schedule is as limited as ``ExternOpNode``. At least, it can be built
@@ -230,5 +230,8 @@ Assert statement is supported, you can simply use it as it is in standard Python
 
 Keywords
 ~~~~~~~~
-- For keywords: ``serial``, ``range``, ``unroll``, ``parallel``, ``vectorize``, ``bind``, ``const_expr``
-- Math keywords: ``log``, ``exp``, ``sigmoid``, ``tanh``, ``power``, ``popcount``
+- For keywords: ``serial``, ``range``, ``unroll``, ``parallel``, ``vectorize``, ``bind``, ``const_range``
+- Math keywords: ``log``, ``exp``, ``sqrt``, ``rsqrt``, ``sigmoid``, ``tanh``, ``power``, ``popcount``, ``round``, ``ceil_div``
+- Allocate keywords: ``allocate``, ``output_tensor``
+- Data type keywords: ``uint8``, ``uint16``, ``uint32``, ``uint64``, ``int8``, ``int16``, ``int32``, ``int64``, ``float16``, ``float32``, ``float64``
+- Others: ``max_num_threads``
diff --git a/include/tvm/auto_scheduler/cost_model.h b/include/tvm/auto_scheduler/cost_model.h
index 89dcab29265d..f7a27895a7d0 100755
--- a/include/tvm/auto_scheduler/cost_model.h
+++ b/include/tvm/auto_scheduler/cost_model.h
@@ -70,6 +70,11 @@ class CostModelNode : public Object {
     LOG(FATAL) << "Not implemented";
   }
 
+  /*!
+   * \brief Default virtual destructor
+   */
+  virtual ~CostModelNode() {}
+
   static constexpr const char* _type_key = "auto_scheduler.CostModel";
   TVM_DECLARE_BASE_OBJECT_INFO(CostModelNode, Object);
 };
diff --git a/include/tvm/auto_scheduler/search_policy.h b/include/tvm/auto_scheduler/search_policy.h
index 176b10c1d7ea..ddb0dd284875 100755
--- a/include/tvm/auto_scheduler/search_policy.h
+++ b/include/tvm/auto_scheduler/search_policy.h
@@ -65,6 +65,7 @@
 #include <tvm/auto_scheduler/search_task.h>
 #include <tvm/node/node.h>
+#include <string>
 #include <unordered_set>
 #include <vector>
 
@@ -191,7 +192,7 @@ class SearchPolicyNode : public Object {
    * We store the string format of a state for redundancy check. This is used to make sure a
    * measured state will never be measured again.
    */
-  std::unordered_set<String> measured_states_set_;
+  std::unordered_set<std::string> measured_states_set_;
   /*! \brief The array of already measured states.
    * The good states can be used as the initial population in evolutionary search. */
   std::vector<State> measured_states_vector_;
diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h
index 7981d58b0ead..0bd1ce6c2ac1 100644
--- a/include/tvm/ir/attrs.h
+++ b/include/tvm/ir/attrs.h
@@ -360,7 +360,7 @@ struct AttrInitEntry {
   }
   // override fields.
   // This function sets the lower bound of the attribute
-  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
+  TSelf& set_lower_bound(const T& begin) {
     if (this->value_missing_) return *this;
     const T& val = *value_;
     if (begin > val) {
@@ -372,7 +372,7 @@ struct AttrInitEntry {
     return *this;
   }
   // This function sets the upper bound of the attribute
-  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
+  TSelf& set_upper_bound(const T& end) {
     if (this->value_missing_) return *this;
     const T& val = *value_;
     if (val > end) {
@@ -384,7 +384,7 @@ struct AttrInitEntry {
     return *this;
   }
   // set default when
-  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+  TSelf& set_default(const T& value) {
     if (!value_missing_) return *this;
     *value_ = value;
     value_missing_ = false;
@@ -548,12 +548,12 @@ class AttrDocEntry {
   using TSelf = AttrDocEntry;
   explicit AttrDocEntry(ObjectPtr<AttrFieldInfoNode> info) : info_(info) {}
 
-  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+  TSelf& describe(const char* str) {
     info_->description = str;
     return *this;
   }
   template <typename T>
-  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+  TSelf& set_default(const T& value) {
     std::ostringstream os;
     os << info_->type_info << ", default=" << value;
     info_->type_info = os.str();
diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
index 7c5484bfa7cb..74dabc168924 100644
--- a/include/tvm/node/container.h
+++ b/include/tvm/node/container.h
@@ -1104,7 +1104,7 @@ class DenseMapNode : public MapNode {
   friend class MapNode;
 };
 
-#define _TVM_DISPATCH_MAP(base, var, body) \
+#define TVM_DISPATCH_MAP(base, var, body) \
   { \
     using TSmall = SmallMapNode*; \
     using TDense = DenseMapNode*; \
@@ -1118,68 +1118,68 @@ class DenseMapNode : public MapNode {
     } \
   }
 
-#define _TVM_DISPATCH_MAP_CONST(base, var, body) \
-  { \
-    using TSmall = const SmallMapNode*; \
-    using TDense = const DenseMapNode*; \
-    uint64_t slots = base->slots_; \
-    if (slots <= SmallMapNode::kMaxSize) { \
-      TSmall var = static_cast<TSmall>(base); \
-      body; \
-    } else { \
-      TDense var = static_cast<TDense>(base); \
-      body; \
-    } \
+#define TVM_DISPATCH_MAP_CONST(base, var, body) \
+  { \
+    using TSmall = const SmallMapNode*; \
+    using TDense = const DenseMapNode*; \
+    uint64_t slots = base->slots_; \
+    if (slots <= SmallMapNode::kMaxSize) { \
+      TSmall var = static_cast<TSmall>(base); \
+      body; \
+    } else { \
+      TDense var = static_cast<TDense>(base); \
+      body; \
+    } \
   }
 
 inline MapNode::iterator::pointer MapNode::iterator::operator->() const {
-  _TVM_DISPATCH_MAP_CONST(self, p, { return p->DeRefItr(index); });
+  TVM_DISPATCH_MAP_CONST(self, p, { return p->DeRefItr(index); });
 }
 
 inline MapNode::iterator& MapNode::iterator::operator++() {
-  _TVM_DISPATCH_MAP_CONST(self, p, {
+  TVM_DISPATCH_MAP_CONST(self, p, {
     index = p->IncItr(index);
     return *this;
   });
 }
 
 inline MapNode::iterator& MapNode::iterator::operator--() {
-  _TVM_DISPATCH_MAP_CONST(self, p, {
+  TVM_DISPATCH_MAP_CONST(self, p, {
     index = p->IncItr(index);
     return *this;
   });
 }
 
 inline size_t MapNode::count(const key_type& key) const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->count(key); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->count(key); });
 }
 
 inline const MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->at(key); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->at(key); });
 }
 
 inline MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) {
-  _TVM_DISPATCH_MAP(this, p, { return p->at(key); });
+  TVM_DISPATCH_MAP(this, p, { return p->at(key); });
 }
 
 inline MapNode::iterator MapNode::begin() const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->begin(); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->begin(); });
 }
 
 inline MapNode::iterator MapNode::end() const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->end(); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->end(); });
 }
 
 inline MapNode::iterator MapNode::find(const MapNode::key_type& key) const {
-  _TVM_DISPATCH_MAP_CONST(this, p, { return p->find(key); });
+  TVM_DISPATCH_MAP_CONST(this, p, { return p->find(key); });
 }
 
 inline void MapNode::erase(const MapNode::iterator& position) {
-  _TVM_DISPATCH_MAP(this, p, { return p->erase(position); });
+  TVM_DISPATCH_MAP(this, p, { return p->erase(position); });
 }
 
-#undef _TVM_DISPATCH_MAP
-#undef _TVM_DISPATCH_MAP_CONST
+#undef TVM_DISPATCH_MAP
+#undef TVM_DISPATCH_MAP_CONST
 
 inline ObjectPtr<Object> MapNode::Empty() { return SmallMapNode::Empty(); }
diff --git a/include/tvm/relay/analysis.h b/include/tvm/relay/analysis.h
index 00da9408408b..b2e7c500eddd 100644
--- a/include/tvm/relay/analysis.h
+++ b/include/tvm/relay/analysis.h
@@ -80,7 +80,7 @@ TVM_DLL bool BasicBlockNormalFormCheck(const Expr& e);
 *
  * For example, the expression `let x = 1 in let x = 2 in 3` bound x twice.
 *
- * `let f = (\x -> x) in let g = (\x -> x + 1) in f(g(2))` also bound x twice,
+ * `let f = (x -> x) in let g = (x -> x + 1) in f(g(2))` also bound x twice,
  * although x is not shadowed.
 *
  * \param expr the expression to check.
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 6bfdb492fed0..fbe31a305ea5 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -596,11 +596,13 @@ struct Conv2DTransposeAttrs : public tvm::AttrsNode<Conv2DTransposeAttrs> {
 
 /*! \brief Attributes used in dilate operator */
 struct DilateAttrs : public tvm::AttrsNode<DilateAttrs> {
   Array<IndexExpr> strides;
+  double dilation_value;
 
   TVM_DECLARE_ATTRS(DilateAttrs, "relay.attrs.DilateAttrs") {
     TVM_ATTR_FIELD(strides)
         .set_default(Array<IndexExpr>({1, 1}))
         .describe("Dilation stride on each dimension, 1 means no dilation.");
+    TVM_ATTR_FIELD(dilation_value).set_default(0.0).describe("Value used to dilate the input.");
   }
 };
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index a03e15a5a836..683f5a28b4f4 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -381,6 +381,25 @@ struct OneHotAttrs : public tvm::AttrsNode<OneHotAttrs> {
   }
 };  // struct OneHotAttrs
 
+/*! \brief Attributes used in matrix_set_diag operator */
+struct MatrixSetDiagAttrs : public tvm::AttrsNode<MatrixSetDiagAttrs> {
+  int k1;
+  int k2;
+  bool super_diag_right_align;
+  bool sub_diag_right_align;
+
+  TVM_DECLARE_ATTRS(MatrixSetDiagAttrs, "relay.attrs.MatrixSetDiagAttrs") {
+    TVM_ATTR_FIELD(k1).set_default(0).describe("Lower limit (included) of the range of diagonals.");
+    TVM_ATTR_FIELD(k2).set_default(0).describe("Upper limit (included) of the range of diagonals.");
+    TVM_ATTR_FIELD(super_diag_right_align)
+        .set_default(true)
+        .describe("Bool, true iff super-diagonal is right aligned (left-padded).");
+    TVM_ATTR_FIELD(sub_diag_right_align)
+        .set_default(false)
+        .describe("Bool, true iff sub-diagonal is right aligned (left-padded).");
+  }
+};  // struct MatrixSetDiagAttrs
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/include/tvm/topi/nn/dilate.h b/include/tvm/topi/nn/dilate.h
index a021402e097c..9b5a8047740e 100644
--- a/include/tvm/topi/nn/dilate.h
+++ b/include/tvm/topi/nn/dilate.h
@@ -55,19 +55,20 @@ PrimExpr all(Array<PrimExpr> args) {
 }
 
 /*!
- * \brief Dilate data with zeros
+ * \brief Dilate data with given dilation value (0 by default).
 *
  * \param x The input tensor, this can have any number of
  *          dimensions and any layout.
  * \param strides Dilation stride for each dimension. Stride 1
  *                means no dilation.
+ * \param dilation_value Value used to dilate the input.
  * \param name The name of the operation
  * \param tag The tag to mark the operation
 *
  * \return The output tensor.
  */
-inline Tensor dilate(const Tensor& x, Array<PrimExpr> strides, std::string name = "tensor",
-                     std::string tag = kInjective) {
+inline Tensor dilate(const Tensor& x, Array<PrimExpr> strides, double dilation_value,
+                     std::string name = "tensor", std::string tag = kInjective) {
   auto n = x->shape.size();
   CHECK_EQ(n, strides.size()) << "strides size (" << strides.size()
                               << ") must match dimension of x (" << n << ")";
@@ -94,7 +95,8 @@ inline Tensor dilate(const Tensor& x, Array<PrimExpr> strides, std::string name
         if (not_zero.size() > 0) {
           auto all_not_zero = all(not_zero);
-          return tvm::if_then_else(all_not_zero, x(index_tuple), make_const(x->dtype, 0));
+          return tvm::if_then_else(all_not_zero, x(index_tuple),
+                                   make_const(x->dtype, dilation_value));
         }
         return x(index_tuple);
       },
diff --git a/include/tvm/topi/nn/pooling.h b/include/tvm/topi/nn/pooling.h
index b6852ffd01c3..935d399a6604 100644
--- a/include/tvm/topi/nn/pooling.h
+++ b/include/tvm/topi/nn/pooling.h
@@ -336,7 +336,9 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
 
 inline bool find_depth_height_width(const std::string& layout, int* depth_axis, int* height_axis,
                                     int* width_axis) {
-  *depth_axis = -1, *height_axis = -1, *width_axis = -1;
+  *depth_axis = -1;
+  *height_axis = -1;
+  *width_axis = -1;
   int curr_idx = 0;
   for (size_t i = 0; i < layout.size(); ++i) {
     if ((layout[i] >= 'A' && layout[i] <= 'Z') || (layout[i] >= 'a' && layout[i] <= 'z')) {
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index 2c0d102e35b1..e01eb703cb99 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -1524,29 +1524,60 @@ inline Tensor sparse_to_dense(const Tensor& sparse_indices, const Array
 }
 
 /*!
- * \brief Returns a tensor with the diagonal of input tensor replaced with the provided diagonal.
+ * \brief Returns a tensor with the diagonal of input tensor replaced with the provided diagonals.
  * \param input input tensor.
- * \param diagonal values to be filled in the diagonal.
+ * \param diagonal values to be filled in the diagonals.
+ * \param k1 lower limit (included) of the range of diagonals.
+ * \param k2 upper limit (included) of the range of diagonals.
+ * \param super_diag_right_align bool, true iff super-diagonal is right aligned (left-padded).
+ * \param sub_diag_right_align bool, true iff sub-diagonal is right aligned (left-padded).
  * \param name output tensor name.
 * \param tag output tensor tag.
  * \return new tensor with given diagonal values.
 */
-inline Tensor matrix_set_diag(const Tensor& input, const Tensor& diagonal,
+inline Tensor matrix_set_diag(const Tensor& input, const Tensor& diagonal, int k1, int k2,
+                              bool super_diag_right_align, bool sub_diag_right_align,
                               const std::string name = "T_matrix_set_diag",
                               const std::string tag = kInjective) {
   size_t ndim = input->shape.size() - 1;
 
+  bool only_one_diagonal = k1 == k2;
+
   return compute(
       input->shape,
       [&](const Array<Var>& iter_vars) {
         auto get_diag = [&]() {
           Array<PrimExpr> diagonal_indices;
-          for (size_t i = 0; i < ndim; i++) {
+          PrimExpr k, offset = 0;
+          for (size_t i = 0; i < ndim - 1; i++) {
             diagonal_indices.push_back(iter_vars[i]);
           }
+          if (only_one_diagonal) {
+            k = k1;
+          } else {
+            // Determining which diagonal/sub-diagonal/super-diagonal it is
+            k = iter_vars[ndim] - iter_vars[ndim - 1];
+            diagonal_indices.push_back(k2 - k);
+
+            // Calculating the offset in diagonal tensor for this diagonal
+            auto get_offset = [&](PrimExpr M, PrimExpr N) {
+              // offset = max_diagonal_length - diagonal_length
+              return diagonal->shape[diagonal->shape.size() - 1] - if_then_else(M < N, M, N);
+            };
+            offset = if_then_else(
+                k >= 0,
+                super_diag_right_align ? get_offset(input->shape[ndim] - k, input->shape[ndim - 1])
+                                       : 0,
+                sub_diag_right_align ? get_offset(input->shape[ndim], input->shape[ndim - 1] + k)
+                                     : 0);
+          }
+          diagonal_indices.push_back(if_then_else(k >= 0, iter_vars[ndim - 1], iter_vars[ndim]) +
+                                     offset);
           return diagonal(diagonal_indices);
         };
-        return if_then_else((PrimExpr)iter_vars[ndim] == iter_vars[ndim - 1], get_diag(),
+        return if_then_else((PrimExpr)iter_vars[ndim] - iter_vars[ndim - 1] >= k1,
+                            if_then_else((PrimExpr)iter_vars[ndim] - iter_vars[ndim - 1] <= k2,
+                                         get_diag(), input(iter_vars)),
                             input(iter_vars));
       },
       name, tag);
diff --git a/pyproject.toml b/pyproject.toml
index 2ea63215baee..8cf53c927c3d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,9 +43,6 @@ exclude = '''
   | src\/
   | vta\/
   | web\/
-  )/|python/topi/python/test_topi_pooling.py
-  |python/tvm/topi/testing/pool3d_python.py
-  |tests/lint/add_asf_header.py
-  |tests/lint/check_file_type.py
+  )/
 )
 '''
diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py
index eae8b2527b6e..d8763dbc78d9 100644
--- a/python/tvm/auto_scheduler/auto_schedule.py
+++ b/python/tvm/auto_scheduler/auto_schedule.py
@@ -199,7 +199,7 @@ def auto_schedule(task, search_policy=None, tuning_options=TuningOptions()):
 
     Returns
     -------
-        A `te.schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`.
+        A `te.Schedule` and a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`.
     """
     if not isinstance(task, SearchTask):
         raise ValueError(
diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py
index 68883a0d06ff..0115dbcf8ebe 100755
--- a/python/tvm/auto_scheduler/compute_dag.py
+++ b/python/tvm/auto_scheduler/compute_dag.py
@@ -39,16 +39,16 @@ class ComputeDAG(Object):
     subgraph) to a ComputeDAG. It keeps the input/output tensors, all operations in the DAG, and
     some static analysis results for the DAG (e.g. the total float operation count,
     consumer/producer relations of operations, whether an operation stage should
-    be tiled/compute inlined ...).
+    be tiled/compute inlined).
     These analyses can help the search policy to make decisions during the search.
     ComputeDAG is also responsible for the interaction between auto-scheduler's `LoopState` and
     TVM schedule (e.g. applying the `LoopState` transform steps to a TVM schedule, providing
-    `LoopState` with extra information got from TVM schedule ...).
+    `LoopState` with extra information got from TVM schedule).
 
     Parameters
     ----------
     compute : Union[List[Tensor], str]
-        `Tensor`s or workload key for a compute declaration.
+        Input/output tensors or workload key for a compute declaration.
     """
 
     def __init__(self, compute):
diff --git a/python/tvm/auto_scheduler/cost_model/cost_model.py b/python/tvm/auto_scheduler/cost_model/cost_model.py
index 83c8463210a6..32e276b31c6a 100644
--- a/python/tvm/auto_scheduler/cost_model/cost_model.py
+++ b/python/tvm/auto_scheduler/cost_model/cost_model.py
@@ -41,9 +41,9 @@ def update(self, inputs, results):
 
         Parameters
         ----------
-        inputs : List[MeasureInput]
+        inputs : List[auto_scheduler.measure.MeasureInput]
            The measurement inputs
-        results : List[MeasureResult]
+        results : List[auto_scheduler.measure.MeasureResult]
            The measurement results
         """
         _ffi_api.CostModelUpdate(self, inputs, results)
@@ -112,9 +112,9 @@ def update(self, inputs, results):
 
        Parameters
        ----------
-        inputs : List[MeasureInput]
+        inputs : List[auto_scheduler.measure.MeasureInput]
            The measurement inputs
-        results : List[MeasureResult]
+        results : List[auto_scheduler.measure.MeasureResult]
            The measurement results
        """
        raise NotImplementedError
diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py
index 15193684b09c..3eb64df693f2 100644
--- a/python/tvm/auto_scheduler/cost_model/xgb_model.py
+++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py
@@ -197,22 +197,26 @@ def predict(self, task, states):
 
     def predict_stages(self, task, states):
         """Predict the scores of all stages in states. This is the breakdown version of `predict`.
+
         Parameters
         ----------
         search_task : SearchTask
             The search task of states
         statse : List[State]
             The input states
+
         Returns
         -------
         scores: List[float]
             The predicted scores for all stages in all states in the packed format
+
         Note
         ----
         For faster data copy between c++ and python, the python part returns scores in a
         single flatten array using a packed format. The c++ part then unpacks the flatten array.

         The packed format is:
         {
+
            float scores[N];                 // scores[i] is the score for states[i].
            int   n_stage_0;                 // the number of stages in states[0]
            float stage_scores_0[[n_stage_0] // the scores for all stages in states[0]
            ...
            int   n_stage_i;                 // the number of stages in states[i]
            float stage_scores_1[n_stage_i]; // the scores for all stages in states[i]
            ...  // untill i == N - 1
+
         }
         To implement this format, we also store int as float, so we can store all numbers
         into a single float array.
diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py
index c57b39bb1d10..7648ebe92a65 100644
--- a/python/tvm/auto_scheduler/measure.py
+++ b/python/tvm/auto_scheduler/measure.py
@@ -25,8 +25,8 @@
 A builder builds the executable binary files and a runner runs the binary
 files to get the measurement results. The flow of data structures is
 
-                  `ProgramBuilder`                 `ProgramRunner`
-`MeasureInput` -----------------> `BuildResult` ----------------> `MeasureResult`
+  .               `ProgramBuilder`                 `ProgramRunner`
+  `MeasureInput` -----------------> `BuildResult` ----------------> `MeasureResult`
 
 We implement these in python to utilize python's multiprocessing and error handling.
 """
@@ -222,7 +222,7 @@ class LocalRunner(ProgramRunner):
         where the first "1" is warm up and will be discarded.
         The returned result contains `repeat` costs,
         each of which is an average of `number` costs.
-    min_repeat_ms : int = 0
+    min_repeat_ms : int = 100
         The minimum duration of one `repeat` in milliseconds.
         By default, one `repeat` contains `number` runs. If this parameter is set,
         the parameters `number` will be dynamically adjusted to meet the
@@ -244,7 +244,7 @@ def __init__(
         timeout=10,
         number=3,
         repeat=1,
-        min_repeat_ms=0,
+        min_repeat_ms=100,
         cooldown_interval=0.0,
         enable_cpu_cache_flush=False,
     ):
@@ -289,7 +289,7 @@ class RPCRunner(ProgramRunner):
         where the first "1" is warm up and will be discarded.
         The returned result contains `repeat` costs,
         each of which is an average of `number` costs.
-    min_repeat_ms : int = 0
+    min_repeat_ms : int = 100
         The minimum duration of one `repeat` in milliseconds.
         By default, one `repeat` contains `number` runs. If this parameter is set,
         the parameters `number` will be dynamically adjusted to meet the
@@ -316,7 +316,7 @@ def __init__(
         timeout=10,
         number=3,
         repeat=1,
-        min_repeat_ms=0,
+        min_repeat_ms=100,
         cooldown_interval=0.0,
         enable_cpu_cache_flush=False,
     ):
diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py
index 0185d04e017c..c7ae19678a30 100644
--- a/python/tvm/auto_scheduler/measure_record.py
+++ b/python/tvm/auto_scheduler/measure_record.py
@@ -66,9 +66,9 @@ def read_lines(self, max_lines=None, skip_lines=0):
 
         Returns
         -------
-        inputs : List[MeasureInput]
+        inputs : List[auto_scheduler.measure.MeasureInput]
             The MeasureInputs loaded from the log file.
-        results : List[MeasureResult]
+        results : List[auto_scheduler.measure.MeasureResult]
             The MeasureResults loaded from the log file.
         """
         inputs, results = _ffi_api.RecordReaderReadLines(
@@ -95,7 +95,7 @@ def load_records(filename):
 
     Returns
     -------
-    logs : List[MeasureInput, MeasureResult]
+    logs : List[auto_scheduler.measure.MeasureInput, auto_scheduler.measure.MeasureResult]
     """
     return zip(*RecordReader(filename).read_lines())
@@ -133,9 +133,9 @@ def load_best(filename, workload_key=None, target=None):
 
     Returns
     -------
-    input : MeasureInput
+    input : auto_scheduler.measure.MeasureInput
        The best State's MeasureInput from this log fine.
-    result : MeasureResult
+    result : auto_scheduler.measure.MeasureResult
        The best State's MeasureResult from this log fine.
    """
    log_reader = RecordReader(filename)
diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py
index a9d323622277..bf7e2eb16ae0 100644
--- a/python/tvm/auto_scheduler/search_policy.py
+++ b/python/tvm/auto_scheduler/search_policy.py
@@ -91,7 +91,7 @@ class SketchPolicy(SearchPolicy):
     ----------
     task : SearchTask
         The SearchTask for the computation declaration.
-    schedule_cost_model : CostModel = RandomModel()
+    program_cost_model : CostModel = RandomModel()
         The cost model to estimate the complete schedules.
     params : Optional[Dict[str, Any]]
         Parameters of the search policy.
@@ -105,9 +105,11 @@ class SketchPolicy(SearchPolicy):
         Callback functions called before the search process, usually used to do extra
         initializations.
         Possible callbacks:
-            - auto_scheduler.PreloadMeasuredStates
-            - auto_scheduler.PreloadCustomSketchRule
-        TODO(jcf94): Add these search callback implementations.
+
+        - auto_scheduler.PreloadMeasuredStates
+        - auto_scheduler.PreloadCustomSketchRule
+
+        TODO(jcf94): Add these search callback implementations.
     """
 
     DEFAULT_PARAMS = {
@@ -129,7 +131,7 @@ class SketchPolicy(SearchPolicy):
     def __init__(
         self,
         task,
-        schedule_cost_model=RandomModel(),
+        program_cost_model=RandomModel(),
         params=None,
         seed=None,
         verbose=1,
@@ -145,7 +147,7 @@ def __init__(
         self.__init_handle_by_constructor__(
             _ffi_api.SketchPolicy,
             task,
-            schedule_cost_model,
+            program_cost_model,
             params,
             seed or random.randint(1, 1 << 30),
             verbose,
diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py
index f0c839800d70..1d9ee6da4f7a 100644
--- a/python/tvm/auto_scheduler/workload_registry.py
+++ b/python/tvm/auto_scheduler/workload_registry.py
@@ -55,13 +55,15 @@ def register_workload(func_name, f=None, override=False):
 
     Examples
     --------
-    @auto_scheduler.register_workload
-    def matmul(N, M, K):
-        A = te.placeholder((N, K), name='A')
-        B = te.placeholder((K, M), name='B')
-        k = te.reduce_axis((0, K), name='k')
-        C = te.compute((N, M), lambda i, j: tvm.sum(A[i][k] * B[k][j], axis=[k]), name='C')
-        return [A, B, C]
+    .. code-block:: python
+
+      @auto_scheduler.register_workload
+      def matmul(N, M, K):
+          A = te.placeholder((N, K), name='A')
+          B = te.placeholder((K, M), name='B')
+          k = te.reduce_axis((0, K), name='k')
+          C = te.compute((N, M), lambda i, j: tvm.sum(A[i][k] * B[k][j], axis=[k]), name='C')
+          return [A, B, C]
     """
     global WORKLOAD_FUNC_REGISTRY
diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py
index 6650f500a996..5a4e26b6d2ee 100644
--- a/python/tvm/autotvm/record.py
+++ b/python/tvm/autotvm/record.py
@@ -57,7 +57,7 @@ def measure_str_key(inp, include_config=True):
 
     Parameters
     ----------
-    inp: MeasureInput
+    inp: autotvm.measure.MeasureInput
         input for the measure
     include_config: bool, optional
         whether includes config in the str key
@@ -78,8 +78,8 @@ def encode(inp, result, protocol="json"):
 
     Parameters
     ----------
-    inp: autotvm.tuner.MeasureInput
-    result: autotvm.tuner.MeasureResult
+    inp: autotvm.measure.MeasureInput
+    result: autotvm.measure.MeasureResult
         pair of input/result
     protocol: str
         log protocol, json or pickle
@@ -135,7 +135,7 @@ def decode(row, protocol="json"):
 
     Returns
     -------
-    ret : tuple(autotvm.tuner.MeasureInput, autotvm.tuner.MeasureResult), or None
+    ret : tuple(autotvm.measure.MeasureInput, autotvm.measure.MeasureResult), or None
         The tuple of input and result, or None if input uses old version log format.
""" # pylint: disable=unused-variable @@ -204,8 +204,8 @@ def load_from_file(filename): Yields ------ - input: autotvm.tuner.MeasureInput - result: autotvm.tuner.MeasureResult + input: autotvm.measure.MeasureInput + result: autotvm.measure.MeasureResult """ for row in open(filename): if row and not row.startswith("#"): diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index bfc49d50350d..bed02581270e 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -184,7 +184,7 @@ class ApplyHistoryBest(DispatchContext): Parameters ---------- - records : str or iterator of (MeasureInput, MeasureResult) + records : str or iterator of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) Collection of tuning records. If is str, then it should be the filename of a records log file. Each row of this file is an encoded record pair. Otherwise, it is an iterator. @@ -205,7 +205,7 @@ def load(self, records): Parameters ---------- - records : str or iterator of (MeasureInput, MeasureResult) + records : str or iterator of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) Collection of tuning records. If is str, then it should be the filename of a records log file. Each row of this file is an encoded record pair. Otherwise, it is an iterator. @@ -382,7 +382,7 @@ def __init__(self, records): """ Parameters ---------- - records : str or iterator of (MeasureInput, MeasureResult) + records : str or iterator of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) Collection of tuning records. If is str, then it should be the filename of a records log file. Each row of this file is an encoded record pair. diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index d700b64e5fd5..fb8cf57ed7c7 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -1078,7 +1078,7 @@ def fallback_with_reference_log(self, ref_log): Parameters ---------- - ref_log: List of (MeasureInput, MeasureResult) + ref_log: List of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) The reference log """ knob_names = [x for x in self.space_map.keys() if isinstance(self.space_map[x], SplitSpace)] diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index cbfe97328d67..9864ba005197 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -188,7 +188,7 @@ def load_history(self, data_set): Parameters ---------- - data_set: Array of (MeasureInput, MeasureResult) pair + data_set: Array of (autotvm.measure.MeasureInput, autotvm.measure.MeasureResult) pair Previous tuning records """ raise NotImplementedError() diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py index 6275f779f778..0ed88216738b 100644 --- a/python/tvm/driver/tvmc/frontends.py +++ b/python/tvm/driver/tvmc/frontends.py @@ -154,6 +154,7 @@ def load(self, path): # pylint: disable=C0415 import onnx + # pylint: disable=E1101 model = onnx.load(path) # pylint: disable=E1101 diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 084f46716e15..3565040e1d76 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -22,10 +22,14 @@ from .._ffi import get_global_func from ..contrib import graph_runtime -from .base import _rpc_connect from ..rpc import RPCSession from .transport import TransportLogger +try: + from .base import _rpc_connect +except ImportError: + raise ImportError("micro 
tvm is not enabled. Set USE_MICRO to ON in config.cmake") + class Session: """MicroTVM Device Session diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index e4d605aa4560..027d6bd76141 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -563,6 +563,23 @@ def infer_value_simulated(input_val, params): return output_value +def try_infer_value(val, on_success=None, on_failure=None): + """Try running infer_value on the input val, and if successful, return the inferred value or + pass it to on_success callback if provided. Otherwise, run on_failure callback if it is + provided, or return the input val as output. In each case, the second return value + indicates whether infer_value has succeeded or not. + """ + try: + ret = infer_value(val, {}).asnumpy() + if on_success: + return on_success(ret), True + return ret, True + except Exception: + if on_failure: + return on_failure(), False + return val, False + + def new_var(name_hint, type_annotation=None, shape=None, dtype="float32"): return _expr.var(name_hint, type_annotation, shape, dtype) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 467dd4fce2af..40ad44961405 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -748,12 +748,13 @@ class Prelu(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): assert len(inputs) == 2, "Prelu need 2 inputs, {} given".format(len(inputs)) + input_channels = infer_shape(inputs[0])[1] alpha_shape = infer_shape(inputs[1]) if len(alpha_shape) != 1: alpha = _op.reshape(inputs[1], (-1,)) else: alpha = inputs[1] - return _op.nn.prelu(inputs[0], alpha) + return _op.nn.prelu(inputs[0], _op.broadcast_to(alpha, [input_channels])) class Reciprocal(OnnxOpConverter): diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 9ceb9fc66ec4..063158f35dc3 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -16,7 +16,7 @@ # under the License. 
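The try_infer_value helper added to common.py above centralizes the try/except pattern around infer_value that the PyTorch frontend below previously repeated inline. A small, illustrative check of its contract:

    import numpy as np
    from tvm import relay
    from tvm.relay.frontend.common import try_infer_value

    # A foldable expression: infer_value succeeds and on_success converts the result.
    dim, ok = try_infer_value(relay.const(np.int64(4)), on_success=lambda arr: int(arr))
    assert ok and dim == 4

    # A free variable cannot be folded; the expression comes back unchanged with ok == False.
    x = relay.var("x", shape=(), dtype="int64")
    expr, ok = try_infer_value(x)
    assert not ok and expr is x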
# pylint: disable=import-self, too-many-lines, len-as-condition, no-else-return, unused-variable, too-many-nested-blocks # pylint: disable=consider-iterating-dictionary, invalid-name, unused-argument, unused-variable, broad-except -# pylint: disable=import-outside-toplevel, simplifiable-if-expression, unnecessary-comprehension +# pylint: disable=import-outside-toplevel, simplifiable-if-expression, cell-var-from-loop, unnecessary-lambda """PT: PyTorch frontend.""" import itertools import logging @@ -36,6 +36,7 @@ from .common import AttrCvt, get_relay_op from .common import infer_shape as _infer_shape from .common import infer_value as _infer_value +from .common import try_infer_value from .common import infer_value_simulated as _infer_value_simulated from .common import infer_type as _infer_type from ..prelude import Prelude, StaticTensorArrayOps @@ -185,11 +186,8 @@ def _impl(inputs, input_types): def _get_value(val, dtype): # dtype is a tvm dtype if isinstance(val, _expr.Expr): - try: - ret = _infer_value(_op.cast(val, dtype), {}).asnumpy() - ret = _expr.const(ret, dtype) - except Exception: - ret = _op.cast(val, dtype) + inp = _op.cast(val, dtype) + ret, _ = try_infer_value(inp, lambda ret: _expr.const(ret, dtype)) else: ret = _create_typed_const(val, dtype) return ret @@ -305,10 +303,7 @@ def _impl(inputs, input_types): dim = int(inputs[1]) stride = int(inputs[4]) if isinstance(inputs[2], _expr.Call): - try: - begin[dim] = np.asscalar(_infer_value(inputs[2], {}).asnumpy().astype(np.int)) - except Exception: - begin[dim] = inputs[2] + begin[dim], _ = try_infer_value(inputs[2], lambda ret: np.asscalar(ret.astype(np.int))) else: begin[dim] = int(inputs[2]) @@ -329,10 +324,9 @@ def _impl(inputs, input_types): target_end = int(inputs[3]) else: if isinstance(inputs[3], _expr.Expr): - try: - target_end = np.asscalar(_infer_value(inputs[3], {}).asnumpy().astype(np.int)) - except Exception: - target_end = inputs[3] + target_end, _ = try_infer_value( + inputs[3], lambda ret: np.asscalar(ret.astype(np.int)) + ) else: target_end = inputs[3] @@ -457,10 +451,7 @@ def _impl(inputs, input_types): sort = bool(inputs[4]) if isinstance(inputs[1], _expr.Expr): - try: - k = _infer_value(inputs[1], {}).asnumpy().tolist() - except Exception: - k = inputs[1] + k, _ = try_infer_value(inputs[1], lambda ret: ret.tolist()) else: k = inputs[1] @@ -546,15 +537,15 @@ def _full_impl(data, fill_value, dtype): size.append(dim) new_shape.append(dim) else: - try: - dim = int(_infer_value(dim, {}).asnumpy()) + dim, success = try_infer_value(dim, lambda ret: int(ret), lambda: 0) + new_shape.append(dim) + + if success: if isinstance(size, list): size.append(dim) - new_shape.append(dim) - except Exception: + else: size = None need_reshape = True - new_shape.append(0) else: if isinstance(size, list): size.append(dim) @@ -1346,12 +1337,11 @@ def _impl(inputs, input_types): if isinstance(s, _expr.Constant): tmp_shape.append(int(s.data.asnumpy())) elif isinstance(s, _expr.Expr): - try: - dim = int(_infer_value(s, {}).asnumpy()) - tmp_shape.append(dim) - except Exception: + dim, success = try_infer_value(s, lambda ret: int(ret)) + tmp_shape.append(dim) + + if not success: is_dyn = True - tmp_shape.append(s) else: tmp_shape.append(s) @@ -1822,13 +1812,14 @@ def _impl(inputs, input_types): # 6 means dtype = float # this happens when converting upsampling with scale factor cast_map = { + 5: "float16", 6: "float32", 7: "float64", 3: "int32", 4: "int64", } - cast_func = {6: float, 7: float, 3: int, 4: int} + cast_func = {5: float, 6: 
float, 7: float, 3: int, 4: int} ret = data if isinstance(data, _expr.Expr): @@ -2312,13 +2303,15 @@ def _impl(inputs, input_types): if isinstance(inputs[1], _expr.Expr): out_size = inputs[1] elif isinstance(inputs[1], list): - try: - infer_res = [_infer_value(size, {}) for size in inputs[1]] - out_size = [np.asscalar(res.asnumpy().astype(np.int)) for res in infer_res] - except Exception: - h = _op.expand_dims(inputs[1][0], axis=0) - w = _op.expand_dims(inputs[1][1], axis=0) - out_size = _op.concatenate([h, w], axis=0) + out_size = [] + for i in [0, 1]: + size, _ = try_infer_value( + inputs[1][i], + lambda ret: ret.astype(np.int), + lambda: _op.expand_dims(inputs[1][i], axis=0), + ) + out_size.append(size) + out_size = _op.concatenate(out_size, axis=0) data = inputs[0] align_corners = inputs[4] @@ -2429,21 +2422,21 @@ def _convert_data_type(input_type, default_dtype=None): return default_dtype input_type = input_type.lower() - if input_type in ["double", "torch.float64"]: + if input_type in ["double", "float64", "torch.float64"]: return "float64" - elif input_type in ["float", "torch.float32"]: + elif input_type in ["float", "float32", "torch.float32"]: return "float32" - elif input_type in ["half", "torch.float16"]: + elif input_type in ["half", "float16", "torch.float16"]: return "float16" - elif input_type in ["long", "torch.int64"]: + elif input_type in ["long", "int64", "torch.int64"]: return "int64" - elif input_type in ["int", "torch.int32"]: + elif input_type in ["int", "int32", "torch.int32"]: return "int32" - elif input_type in ["short", "torch.int16"]: + elif input_type in ["short", "int16", "torch.int16"]: return "int16" - elif input_type in ["char", "torch.int8"]: + elif input_type in ["char", "int8", "torch.int8"]: return "int8" - elif input_type in ["byte", "torch.uint8"]: + elif input_type in ["byte", "uint8", "torch.uint8"]: return "uint8" elif input_type in ["quint8", "torch.quint8"]: return "quint8" @@ -2851,7 +2844,7 @@ def _get_operator_nodes(nodes): return ops -def _get_relay_input_vars(graph, input_shapes, prelude, is_module=True, default_dtype="float32"): +def _get_relay_input_vars(graph, input_infos, prelude, is_module=True, default_dtype="float32"): """ Return Relay vars from input shapes and create entries based on expected graph inputs - to allow translation @@ -2862,17 +2855,17 @@ def _get_relay_input_vars(graph, input_shapes, prelude, is_module=True, default_ # a module has "self" as first input, which we do not need/want graph_inputs = graph_inputs[1:] - if not isinstance(input_shapes, list): - msg = "Graph inputs input_shapes should be a list" + if not isinstance(input_infos, list): + msg = "Graph inputs input_infos should be a list" raise RuntimeError(msg) - if len(graph_inputs) != len(input_shapes): - msg = "PyTorch has {} inputs and input_shapes lists {}.".format( - len(graph_inputs), len(input_shapes) + if len(graph_inputs) != len(input_infos): + msg = "PyTorch has {} inputs and input_infos lists {}.".format( + len(graph_inputs), len(input_infos) ) raise RuntimeError(msg) - def get_relay_ty(ishape, pt_type): + def get_relay_ty(ishape, itype, pt_type): if pt_type.kind() == "TensorType": if not (_is_int_seq(ishape) or len(ishape) == 0): msg = "Shape for Tensors must be lists of ints" @@ -2884,6 +2877,8 @@ def get_relay_ty(ishape, pt_type): msg = "Shapes of input list and information in the graph do not match" raise RuntimeError(msg) pt_dtype = pt_type.scalarType() + if not pt_dtype and itype: + pt_dtype = itype dtype = _convert_data_type(pt_dtype, 
default_dtype=default_dtype) return TensorType(ishape, dtype) elif pt_type.kind() == "TupleType": @@ -2891,37 +2886,45 @@ def get_relay_ty(ishape, pt_type): msg = "Shapes for tuples must be tuples" raise RuntimeError(msg) return TupleType( - [get_relay_ty(elem, pt_t) for elem, pt_t in zip(ishape, pt_type.elements())] + [get_relay_ty(elem, itype, pt_t) for elem, pt_t in zip(ishape, pt_type.elements())] ) elif pt_type.kind() == "ListType": if not isinstance(ishape, list): msg = "Shapes for lists must be lists" raise RuntimeError(msg) pt_elemtype = pt_type.getElementType() - elem_tys = [get_relay_ty(s, pt_elemtype) for s in ishape] + elem_tys = [get_relay_ty(s, itype, pt_elemtype) for s in ishape] if len(elem_tys) > 0 and not all(map(lambda ty: ty == elem_tys[0], elem_tys)): msg = "List elements need have identical types" raise RuntimeError(msg) return prelude.l(elem_tys[0]) elif pt_type.kind() == "OptionalType": # we do not support None yet, so we fill in the type - return get_relay_ty(ishape, pt_type.getElementType()) + return get_relay_ty(ishape, itype, pt_type.getElementType()) # TODO: scalar inputs raise NotImplementedError("unsupported input type") input_vars = {} - for num, inp in enumerate(input_shapes): + new_input_infos = [] + for num, inp in enumerate(input_infos): if not isinstance(inp, tuple): msg = "Graph input {} is not a tuple".format(num) raise RuntimeError(msg) if len(inp) != 2 or not isinstance(inp[0], str): - msg = "Graph input {} is not valid, expected ('name', shape)".format(inp) + msg = ( + "Graph input {} is not valid," + " expected ('name', shape) or ('name', (shape, dtype))".format(inp) + ) raise RuntimeError(msg) + if not isinstance(inp[1], tuple) or len(inp[1]) == 0 or not isinstance(inp[1][-1], str): + new_input_infos.append((inp[0], (inp[1], default_dtype))) + else: + new_input_infos.append(inp) input_types = [ - (name, get_relay_ty(shape, gi.type())) - for (name, shape), gi in zip(input_shapes, graph_inputs) + (name, get_relay_ty(info[0], info[1], gi.type())) + for (name, info), gi in zip(new_input_infos, graph_inputs) ] ir_inputs = [i.debugName() for i in graph_inputs] @@ -3252,7 +3255,7 @@ def get_all_op_names(graph): return set(node.kind() for node in nodes) -def from_pytorch(script_module, input_shapes, custom_convert_map=None, default_dtype="float32"): +def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dtype="float32"): """Load PyTorch model in the form of a scripted PyTorch model and convert into relay. The companion parameters will be handled automatically. @@ -3262,10 +3265,15 @@ def from_pytorch(script_module, input_shapes, custom_convert_map=None, default_d TorchScripted PyTorch graph Note: We currently only support traces (ie: torch.jit.trace(model, input)) - input_shapes : List of tuples of input name and input dimensions - Graph level input shape list + input_infos: List of tuples of (input name, input shape) + or (input name, (input shape, input types)) + Graph level input shape and type list The same input names need to be used for deployment, so choose easy to remember names (such as: input0, input1) + e.g. 
+ [('input0', (1, 2)), ('input1', (3, 4))] + or + [('input0', ((1, 2), 'int')), ('input1', ((3, 4), 'float'))] custom_convert_map: Dictionary of str to Relay op A custom op conversion map in the same format as _convert_map above @@ -3297,7 +3305,7 @@ def from_pytorch(script_module, input_shapes, custom_convert_map=None, default_d is_module = isinstance(script_module, torch.jit.ScriptModule) params = script_module.state_dict() if is_module else {} outputs = _get_relay_input_vars( - graph, input_shapes, prelude, default_dtype=default_dtype, is_module=is_module + graph, input_infos, prelude, default_dtype=default_dtype, is_module=is_module ) param_vars, tensors, packed_param_map = convert_params(graph, params) tvm_params = {k: tvm.nd.array(v) for k, v in tensors.items()} diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index 0c7df75489ae..77fdbbd4006c 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -337,3 +337,11 @@ def global_avg_pool2d(attrs, args): if attrs.layout != "NHWC": return False return True + + +@tvm.ir.register_op_attr("maximum", "target.arm_compute_lib") +def maximum(attrs, args): + """Check if the external ACL codegen for maximum should be used.""" + type_a = args[0].checked_type + type_b = args[1].checked_type + return (type_a.dtype == "float32") and (type_b.dtype == "float32") diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 086603550532..c83f6a943a31 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -668,7 +668,7 @@ def compute_cross_entropy(attrs, inputs, out_dtype): # dilate @reg.register_compute("nn.dilate") def compute_dilate(attrs, inputs, out_dtype): - return [topi.nn.dilate(inputs[0], attrs.strides)] + return [topi.nn.dilate(inputs[0], attrs.strides, attrs.dilation_value)] reg.register_broadcast_schedule("nn.dilate") diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 853cd4240b48..86a76ff28fa5 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1549,23 +1549,26 @@ def pad(data, pad_width, pad_value=0, pad_mode="constant"): return _make.pad(data, pad_width, pad_value, pad_mode) -def dilate(data, strides): - """Dilate data with zeros. +def dilate(data, strides, dilation_value=0.0): + """Dilate data with given dilation value (0 by default). Parameters ---------- data : tvm.relay.Expr n-D, can be any layout. - strides : + strides : tuple of <int> Dilation stride on each dimension, 1 means no dilation. + dilation_value : int/float, optional + Value used to dilate the input.
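To make dilation_value concrete: the holes inserted between elements take the given value instead of hard-coded zeros, and each output dimension becomes (n - 1) * stride + 1. A quick check against the numpy reference, which gains the same parameter later in this diff:

    import numpy as np
    from tvm.topi.testing import dilate_python

    x = np.array([[1.0, 2.0],
                  [3.0, 4.0]])
    y = dilate_python(x, (2, 2), dilation_value=7.0)  # (2, 2) -> (3, 3)
    # [[1. 7. 2.]
    #  [7. 7. 7.]
    #  [3. 7. 4.]]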
+ Returns ------- Output : tvm.relay.Expr The computed result """ - return _make.dilate(data, strides) + return _make.dilate(data, strides, dilation_value) def mirror_pad(data, pad_width, mode="SYMMETRIC"): diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 1833dfe2075d..54624ce35b55 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -204,6 +204,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic", ) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + logger.warning("group_conv2d with layout NHWC is not optimized for arm cpu.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {} for arm cpu".format(layout)) return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 393854ea2536..56ae97652b79 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -256,6 +256,13 @@ def conv2d_strategy(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic", ) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index bbe01f4feb6c..e2a82d396b22 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -175,6 +175,14 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic", ) + elif layout == "NHWC": + assert kernel_layout == "HWIO" + logger.warning("group_conv2d is not optimized for x86.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), + name="group_conv2d_nhwc.generic", + ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 8ccd14890581..14ac454aec64 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -1178,17 +1178,33 @@ def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0 return _make.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value) -def matrix_set_diag(data, diagonal): +def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"): """ - Returns a tensor with the diagonal of input tensor replaced with the provided diagonal values. + Returns a tensor with the diagonals of input tensor replaced with the provided diagonal values. Parameters ---------- data : relay.Expr Input Tensor. + diagonal : relay.Expr Values to be filled in the diagonal. + k : int or tuple of int, optional + Diagonal Offset(s). The diagonal or range of diagonals to set. 
(0 by default) + Positive value means superdiagonal, 0 refers to the main diagonal, and + negative value means subdiagonals. k can be a single integer (for a single diagonal) + or a pair of integers specifying the low and high ends of a matrix band. + k[0] must not be larger than k[1]. + + align : string, optional + Some diagonals are shorter than max_diag_len and need to be padded. + align is a string specifying how superdiagonals and subdiagonals should be aligned, + respectively. There are four possible alignments: "RIGHT_LEFT" (default), "LEFT_RIGHT", + "LEFT_LEFT", and "RIGHT_RIGHT". "RIGHT_LEFT" aligns superdiagonals to the right + (left-pads the row) and subdiagonals to the left (right-pads the row). It is the packing + format LAPACK uses. cuSPARSE uses "LEFT_RIGHT", which is the opposite alignment. + Returns ------- result : relay.Expr @@ -1216,7 +1232,22 @@ def matrix_set_diag(data, diagonal): [7, 5, 7, 7], [7, 7, 6, 7]]] """ - return _make.matrix_set_diag(data, diagonal) + if isinstance(k, (tuple, list)): + k_one = k[0] + if len(k) >= 2: + k_two = k[1] + else: + k_two = k[0] + else: + k_one = k + k_two = k + + super_diag_right_align = align[:5] == "RIGHT" + sub_diag_right_align = align[-5:] == "RIGHT" + + return _make.matrix_set_diag( + data, diagonal, k_one, k_two, super_diag_right_align, sub_diag_right_align + ) def adv_index(inputs): diff --git a/python/tvm/relay/testing/yolo_detection.py b/python/tvm/relay/testing/yolo_detection.py index c457ce3dd01d..a387f3076bf5 100644 --- a/python/tvm/relay/testing/yolo_detection.py +++ b/python/tvm/relay/testing/yolo_detection.py @@ -196,41 +196,91 @@ def do_nms_sort(dets, classes, thresh): dets[j]["prob"][k] = 0 +def get_detections(im, det, thresh, names, classes): + "Draw the markings around the detected region" + labelstr = [] + category = -1 + detection = None + valid = False + for j in range(classes): + if det["prob"][j] > thresh: + if category == -1: + category = j + labelstr.append(names[j] + " " + str(round(det["prob"][j], 4))) + + if category > -1: + valid = True + imc, imh, imw = im.shape + width = int(imh * 0.006) + offset = category * 123457 % classes + red = _get_color(2, offset, classes) + green = _get_color(1, offset, classes) + blue = _get_color(0, offset, classes) + rgb = [red, green, blue] + b = det["bbox"] + left = int((b.x - b.w / 2.0) * imw) + right = int((b.x + b.w / 2.0) * imw) + top = int((b.y - b.h / 2.0) * imh) + bot = int((b.y + b.h / 2.0) * imh) + + if left < 0: + left = 0 + if right > imw - 1: + right = imw - 1 + if top < 0: + top = 0 + if bot > imh - 1: + bot = imh - 1 + + detection = { + "category": category, + "labelstr": labelstr, + "left": left, + "top": top, + "right": right, + "bot": bot, + "width": width, + "rgb": rgb, + } + + return valid, detection + + def draw_detections(font_path, im, dets, thresh, names, classes): "Draw the markings around the detected region" for det in dets: - labelstr = [] - category = -1 - for j in range(classes): - if det["prob"][j] > thresh: - if category == -1: - category = j - labelstr.append(names[j] + " " + str(round(det["prob"][j], 4))) - if category > -1: - imc, imh, imw = im.shape - width = int(imh * 0.006) - offset = category * 123457 % classes - red = _get_color(2, offset, classes) - green = _get_color(1, offset, classes) - blue = _get_color(0, offset, classes) - rgb = [red, green, blue] - b = det["bbox"] - left = int((b.x - b.w / 2.0) * imw) - right = int((b.x + b.w / 2.0) * imw) - top = int((b.y - b.h / 2.0) * imh) - bot = int((b.y + b.h / 2.0) * imh) - - if left 
< 0: - left = 0 - if right > imw - 1: - right = imw - 1 - if top < 0: - top = 0 - if bot > imh - 1: - bot = imh - 1 - _draw_box_width(im, left, top, right, bot, width, red, green, blue) - label = _get_label(font_path, "".join(labelstr), rgb) - _draw_label(im, top + width, left, label, rgb) + valid, detection = get_detections(im, det, thresh, names, classes) + if valid: + rgb = detection["rgb"] + label = _get_label(font_path, "".join(detection["labelstr"]), rgb) + _draw_box_width( + im, + detection["left"], + detection["top"], + detection["right"], + detection["bot"], + detection["width"], + rgb[0], + rgb[1], + rgb[2], + ) + _draw_label(im, detection["top"] + detection["width"], detection["left"], label, rgb) + + +def show_detections(im, dets, thresh, names, classes): + "Print the markings and the detected region" + for det in dets: + valid, detection = get_detections(im, det, thresh, names, classes) + if valid: + print( + "class:{} left:{} right:{} top:{} bottom:{}".format( + detection["labelstr"], + detection["left"], + detection["top"], + detection["right"], + detection["bot"], + ) + ) def _get_pixel(im, x, y, c): diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py index 2b5624929bec..4bc3f97d850b 100644 --- a/python/tvm/topi/generic/nn.py +++ b/python/tvm/topi/generic/nn.py @@ -428,6 +428,23 @@ def schedule_group_conv2d_nchw(outs): return _default_schedule(outs, False) +def schedule_group_conv2d_nhwc(outs): + """Schedule for group_conv2d_nhwc + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of group_conv2d_nhwc + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + + def schedule_deformable_conv2d_nchw(outs): """Schedule for deformable_conv2d_nchw diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 524558441b21..c0e941c4acc7 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -827,6 +827,91 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp ) +def group_conv2d_nhwc(Input, Filter, stride, padding, dilation, groups, out_dtype=None): + """Group convolution operator in NHWC layout. + + Parameters + ---------- + Input : tvm.te.Tensor + 4-D with shape [batch, in_height, in_width, in_channel] + + Filter : tvm.te.Tensor + 4-D with shape [filter_height, filter_width, in_channel // groups, num_filter] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + dilation : int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + groups : int + number of groups + + out_dtype : str + The output type. This is used for mixed precision. 
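The NHWC group convolution documented above (implementation just below) uses the standard output-size arithmetic, and each output filter reduces only over the input channels of its own group. Worked numbers for an assumed configuration:

    # Assumed: Input (1, 56, 56, 64), Filter (3, 3, 8, 128), groups=8,
    # stride=1, dilation=1, pad_top=pad_down=1.
    out_h = (56 - (3 - 1) * 1 - 1 + 1 + 1) // 1 + 1   # = 56
    # Input-channel base for output filter ff, as in the te.compute below:
    #   ff // (num_filter // groups) * (in_channel // groups) + rc
    ff = 100
    group = ff // (128 // 8)    # filters 96..111 form group 6
    base = group * (64 // 8)    # this filter reduces over input channels 48..55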
+ + Returns + ------- + Output : tvm.te.Tensor + 4-D with shape [batch, out_height, out_width, out_channel] + """ + if out_dtype is None: + out_dtype = Input.dtype + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + batch, in_height, in_width, in_channel = get_const_tuple(Input.shape) + kernel_h, kernel_w, _, num_filter = get_const_tuple(Filter.shape) + + assert in_channel % groups == 0, "input channels must divide group size" + assert num_filter % groups == 0, "output channels must divide group size" + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) + # compute the output shape + out_channel = num_filter + out_height = simplify( + (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 + ) + out_width = simplify( + (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 + ) + # compute graph + pad_before = [0, pad_top, pad_left, 0] + pad_after = [0, pad_down, pad_right, 0] + temp = pad(Input, pad_before, pad_after, name="pad_temp") + ry = te.reduce_axis((0, kernel_h), name="ry") + rx = te.reduce_axis((0, kernel_w), name="rx") + rc = te.reduce_axis((0, in_channel // groups), name="rc") + return te.compute( + (batch, out_height, out_width, out_channel), + lambda nn, yy, xx, ff: te.sum( + temp[ + nn, + yy * stride_h + ry * dilation_h, + xx * stride_w + rx * dilation_w, + ff // (num_filter // groups) * (in_channel // groups) + rc, + ].astype(out_dtype) + * Filter[ry, rx, rc, ff].astype(out_dtype), + axis=[ry, rx, rc], + ), + tag="group_conv2d_nhwc", + ) + + def unpack_NCHWc_to_nchw(packed_out, out_dtype): """Unpack conv2d_NCHWc output from layout NCHWc to NCHW diff --git a/python/tvm/topi/nn/conv2d_transpose.py b/python/tvm/topi/nn/conv2d_transpose.py index d1edbaae9c7f..f67f9c9c8a5a 100644 --- a/python/tvm/topi/nn/conv2d_transpose.py +++ b/python/tvm/topi/nn/conv2d_transpose.py @@ -97,11 +97,11 @@ def declaration_conv2d_transpose_impl(data, kernel, strides, padding, out_dtype, # convolution stage out_c = simplify(out_c) - out_h = simplify(in_h - filter_h + 1 + output_padding[0]) - out_w = simplify(in_w - filter_w + 1 + output_padding[1]) - dc = tvm.reduce_axis((0, in_c), name="dc") - dh = tvm.reduce_axis((0, filter_h), name="dh") - dw = tvm.reduce_axis((0, filter_w), name="dw") + out_h = simplify(in_h - filter_h + 1) + out_w = simplify(in_w - filter_w + 1) + dc = te.reduce_axis((0, in_c), name="dc") + dh = te.reduce_axis((0, filter_h), name="dh") + dw = te.reduce_axis((0, filter_w), name="dw") Output = te.compute( (batch, out_c, out_h, out_w), diff --git a/python/tvm/topi/nn/dilate.py b/python/tvm/topi/nn/dilate.py index 836e29a6812d..6980fea58173 100644 --- a/python/tvm/topi/nn/dilate.py +++ b/python/tvm/topi/nn/dilate.py @@ -23,8 +23,8 @@ @te.tag_scope(tag=tag.INJECTIVE + ",dilate") -def dilate(data, strides, name="DilatedInput"): - """Dilate data with zeros. +def dilate(data, strides, dilation_value=0.0, name="DilatedInput"): + """Dilate data with given dilation value (0 by default). Parameters ---------- @@ -34,6 +34,9 @@ def dilate(data, strides, name="DilatedInput"): strides : list / tuple of n ints Dilation stride on each dimension, 1 means no dilation. 
+ dilation_value : int/float, optional + Value used to dilate the input. + name : str, optional The name prefix operators generated @@ -62,7 +65,7 @@ def _dilate(*indices): if not_zero: not_zero = tvm.tir.all(*not_zero) return tvm.tir.if_then_else( - not_zero, data(*index_tuple), tvm.tir.const(0.0, data.dtype) + not_zero, data(*index_tuple), tvm.tir.const(dilation_value, data.dtype) ) return data(*index_tuple) diff --git a/python/tvm/topi/testing/dilate_python.py b/python/tvm/topi/testing/dilate_python.py index b4fff24a1d43..0ae611559729 100644 --- a/python/tvm/topi/testing/dilate_python.py +++ b/python/tvm/topi/testing/dilate_python.py @@ -19,7 +19,7 @@ import numpy as np -def dilate_python(input_np, strides): +def dilate_python(input_np, strides, dilation_value=0.0): """Dilate operation. Parameters ---------- @@ -30,6 +30,9 @@ def dilate_python(input_np, strides): strides : list / tuple of n ints Dilation stride on each dimension, 1 means no dilation. + dilation_value : int/float, optional + Value used to dilate the input. + Returns ------- output_np : numpy.ndarray @@ -45,7 +48,8 @@ def dilate_python(input_np, strides): for i in range(n): output_size += ((input_np.shape[i] - 1) * strides[i] + 1,) no_zero += ((range(0, output_size[i], strides[i])),) - output_np = np.zeros(shape=output_size) + output_np = np.ones(shape=output_size) + output_np = dilation_value * output_np output_np[np.ix_(*no_zero)] = input_np return output_np diff --git a/python/tvm/topi/testing/matrix_set_diag.py b/python/tvm/topi/testing/matrix_set_diag.py index 63edd0a6d637..81a8f6cccafe 100644 --- a/python/tvm/topi/testing/matrix_set_diag.py +++ b/python/tvm/topi/testing/matrix_set_diag.py @@ -19,20 +19,28 @@ import numpy as np -def matrix_set_diag(input_np, diagonal): +def matrix_set_diag(input_np, diagonal, k=0, align="RIGHT_LEFT"): """matrix_set_diag operator implemented in numpy. - Returns a numpy array with the diagonal of input array + Returns a numpy array with the diagonals of input array replaced with the provided diagonal values. Parameters ---------- - input : numpy.ndarray + input_np : numpy.ndarray Input Array. Shape = [D1, D2, D3, ... , Dn-1 , Dn] + diagonal : numpy.ndarray Values to be filled in the diagonal. - Shape = [D1, D2, D3, ... , Dn-1] + + k : int or tuple of int + Diagonal Offsets. + + align : string + Some diagonals are shorter than max_diag_len and need to be padded. + Possible Values: + ["RIGHT_LEFT" (default), "LEFT_RIGHT", "LEFT_LEFT", "RIGHT_RIGHT"] Returns ------- @@ -41,8 +49,36 @@ def matrix_set_diag(input_np, diagonal): Shape = [D1, D2, D3, ...
, Dn-1 , Dn] """ out = np.array(input_np, copy=True) - n = min(input_np.shape[-1], input_np.shape[-2]) - for i in range(n): - out[..., i, i] = diagonal[..., i] + cols = input_np.shape[-1] + rows = input_np.shape[-2] + + onlyOneDiagonal = True + if isinstance(k, (tuple, list)): + if len(k) < 2 or k[0] == k[1]: + k = k[0] + else: + onlyOneDiagonal = False + + if onlyOneDiagonal: + for i in range(diagonal.shape[-1]): + if k >= 0: + out[..., i, i + k] = diagonal[..., i] + else: + out[..., i - k, i] = diagonal[..., i] + else: + for ki in range(k[0], k[1] + 1): + diag_len = min(cols - max(ki, 0), rows + min(ki, 0)) + offset = 0 + if ki >= 0: + if align[:5] == "RIGHT": + offset = diagonal.shape[-1] - diag_len + else: + if align[-5:] == "RIGHT": + offset = diagonal.shape[-1] - diag_len + for i in range(diag_len): + if ki >= 0: + out[..., i, i + ki] = diagonal[..., k[1] - ki, i + offset] + else: + out[..., i - ki, i] = diagonal[..., k[1] - ki, i + offset] return out diff --git a/python/tvm/topi/testing/pool3d_python.py b/python/tvm/topi/testing/pool3d_python.py index 457c4015a405..8c687f737166 100644 --- a/python/tvm/topi/testing/pool3d_python.py +++ b/python/tvm/topi/testing/pool3d_python.py @@ -20,12 +20,20 @@ import numpy as np import tvm -def pool3d_ncdhw_python(np_data, kernel, - strides, padding, - out_shape, pool_type, - count_include_pad=True, - ceil_mode=False, dtype="float32"): + +def pool3d_ncdhw_python( + np_data, + kernel, + strides, + padding, + out_shape, + pool_type, + count_include_pad=True, + ceil_mode=False, + dtype="float32", +): """baseline for max_pool3d and avg_pool3d, default layout is "NCDHW""" + # fmt: off in_n, in_c, in_d, in_h, in_w = in_shape = np_data.shape if isinstance(kernel, int): k_d = k_h = k_w = kernel @@ -99,4 +107,5 @@ def pool3d_ncdhw_python(np_data, kernel, raise ValueError("pool type {} is not supported".format(pool_type)) ret_np = np.maximum(ret_np, fill_value) + # fmt: on return ret_np diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py index 6af0828da448..c4e51a8858d1 100644 --- a/python/tvm/topi/transform.py +++ b/python/tvm/topi/transform.py @@ -806,17 +806,33 @@ def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0 return cpp.sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value) -def matrix_set_diag(data, diagonal): +def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"): """ - Returns a tensor with the diagonal of input tensor replaced with the provided diagonal values. + Returns a tensor with the diagonals of input tensor replaced with the provided diagonal values. Parameters ---------- data : relay.Expr Input Tensor. + diagonal : relay.Expr Values to be filled in the diagonal. + k : int or tuple of int, optional + Diagonal Offset(s). The diagonal or range of diagonals to set. (0 by default) + Positive value means superdiagonal, 0 refers to the main diagonal, and + negative value means subdiagonals. k can be a single integer (for a single diagonal) + or a pair of integers specifying the low and high ends of a matrix band. + k[0] must not be larger than k[1]. + + align : string, optional + Some diagonals are shorter than max_diag_len and need to be padded. + align is a string specifying how superdiagonals and subdiagonals should be aligned, + respectively. There are four possible alignments: "RIGHT_LEFT" (default), "LEFT_RIGHT", + "LEFT_LEFT", and "RIGHT_RIGHT". 
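Before the alignment description continues below, a worked offset example using the numpy reference implemented above (a single superdiagonal, so align does not come into play):

    import numpy as np
    from tvm.topi.testing import matrix_set_diag

    a = np.full((3, 4), 7)
    out = matrix_set_diag(a, np.array([10, 20, 30]), k=1)
    # [[ 7 10  7  7]
    #  [ 7  7 20  7]
    #  [ 7  7  7 30]]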
"RIGHT_LEFT" aligns superdiagonals to the right + (left-pads the row) and subdiagonals to the left (right-pads the row). It is the packing + format LAPACK uses. cuSPARSE uses "LEFT_RIGHT", which is the opposite alignment. + Returns ------- result : relay.Expr @@ -836,7 +852,7 @@ def matrix_set_diag(data, diagonal): diagonal = [[1, 2, 3], [4, 5, 6]] - relay.matrix_set_diag(input, diagonal) = + topi.matrix_set_diag(input, diagonal) = [[[1, 7, 7, 7], [7, 2, 7, 7], [7, 7, 3, 7]], @@ -844,7 +860,22 @@ def matrix_set_diag(data, diagonal): [7, 5, 7, 7], [7, 7, 6, 7]]] """ - return cpp.matrix_set_diag(data, diagonal) + if isinstance(k, (tuple, list)): + k_one = k[0] + if len(k) >= 2: + k_two = k[1] + else: + k_two = k[0] + else: + k_one = k + k_two = k + + super_diag_right_align = align[:5] == "RIGHT" + sub_diag_right_align = align[-5:] == "RIGHT" + + return cpp.matrix_set_diag( + data, diagonal, k_one, k_two, super_diag_right_align, sub_diag_right_align + ) def adv_index(data, indices): diff --git a/rust/tvm-sys/src/context.rs b/rust/tvm-sys/src/context.rs index 64b58b9f42c9..3747bfcba314 100644 --- a/rust/tvm-sys/src/context.rs +++ b/rust/tvm-sys/src/context.rs @@ -248,6 +248,18 @@ impl Display for Context { } } +impl<'a> From<&'a Context> for ArgValue<'a> { + fn from(ctx: &'a Context) -> Self { + DLContext::from(ctx).into() + } +} + +impl<'a> From for ArgValue<'a> { + fn from(ctx: Context) -> Self { + DLContext::from(ctx).into() + } +} + impl From for RetValue { fn from(ret_value: Context) -> RetValue { RetValue::Context(ret_value.into()) diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py index 14e2eee9c48a..904f244e0a9a 100644 --- a/rust/tvm/examples/resnet/src/build_resnet.py +++ b/rust/tvm/examples/resnet/src/build_resnet.py @@ -74,8 +74,9 @@ def build(target_dir): block = get_model("resnet18_v1", pretrained=True) net, params = relay.frontend.from_mxnet(block, {"data": data_shape}) # we want a probability so add a softmax operator + func = net["main"] net = relay.Function( - net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs ) else: # use random weights from relay.testing diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc index fbb52a9ebe7a..876b7db188c6 100644 --- a/src/arith/const_int_bound.cc +++ b/src/arith/const_int_bound.cc @@ -85,7 +85,7 @@ struct ConstIntBoundAnalyzer::Entry { class ConstIntBoundAnalyzer::Impl : public ExprFunctor { public: - /*! \brief additional bound info about expr \in bound */ + /*! \brief additional bound info about expr in bound */ struct BoundInfo { /*! \brief The expr */ PrimExpr expr; diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index 6c8498fa1912..a7aded6bc7e5 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -1535,7 +1535,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) { } else if (op->op.same_as(tir::builtin::shift_left())) { if (op->args[0].as() && op->args[1].as()) { // the operator overload will eagerly constant fold. 
- return op->args[0] & op->args[1]; + return op->args[0] << op->args[1]; } } ExprDeepEqual expr_equal; diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index ffc00941143c..a89fa4b0c77a 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -27,10 +27,12 @@ #include "sketch_policy.h" #include +#include #include #include #include +#include #include #include #include @@ -45,7 +47,6 @@ namespace tvm { namespace auto_scheduler { /********** Sketch generation rules **********/ - static RuleSkipStage rule_skip_stage; static RuleAlwaysInline rule_always_inline; static RuleMultiLevelTiling rule_multi_level_tiling; @@ -58,7 +59,6 @@ static RuleSimplifyComputeWithConstTensor rule_simplify_compute_with_const_tenso static RuleSpecialComputeLocationGPU rule_special_compute_location_gpu; /********** Init population rules **********/ - static InitFillTileSize init_fill_tile_size; static InitChangeComputeLocation init_change_compute_location; static InitParallel init_parallel; @@ -66,23 +66,15 @@ static InitUnroll init_unroll; static InitVectorization init_vectorization; static InitThreadBind init_thread_bind; -/********** Mutation rules **********/ - -static MutateTileSize mutate_tile_size; -static MutateMaxUnrollFactor mutate_max_unroll_factor; -static MutateComputeLocation mutate_compute_location; -static MutateParallel mutate_parallel; - /********** Sketch policy **********/ - TVM_REGISTER_NODE_TYPE(SketchPolicyNode); -SketchPolicy::SketchPolicy(SearchTask task, CostModel schedule_cost_model, +SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, Map params, int seed, int verbose, Optional> init_search_callbacks) { auto node = make_object(); node->search_task = std::move(task); - node->schedule_cost_model = std::move(schedule_cost_model); + node->program_cost_model = std::move(program_cost_model); node->rand_gen = std::mt19937(seed); node->params = std::move(params); node->verbose = verbose; @@ -97,18 +89,32 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel schedule_cost_model, node->RunCallbacks(init_search_callbacks.value()); } - // Notice: Some rules require us to skip all the rest rules after they are applied. - // So the rules below should be ordered carefully. + // NOTE: There are strong dependency among the rules below, + // so the order to push them into the vector should be considered carefully. 
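Since the constructor argument is renamed end to end (including the Python keyword argument in search_policy.py earlier in this diff), downstream callers must be updated. A hypothetical call site, with the SearchTask construction elided:

    from tvm import auto_scheduler

    # task = auto_scheduler.SearchTask(...)  # assumed to already exist
    policy = auto_scheduler.SketchPolicy(
        task,
        program_cost_model=auto_scheduler.XGBModel(),  # was: schedule_cost_model=...
    )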
if (IsCPUTask(node->search_task)) { - // The default sketch rules for CPU policy + // Sketch Generation Rules node->sketch_rules.push_back(&rule_always_inline); node->sketch_rules.push_back(&rule_simplify_compute_with_const_tensor); node->sketch_rules.push_back(&rule_add_rfactor); node->sketch_rules.push_back(&rule_add_cache_write_stage); node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); node->sketch_rules.push_back(&rule_multi_level_tiling); - } else if (IsCUDATask(node->search_task)) { - // The default sketch rules for CUDA policy + node->sketch_rules.push_back(&rule_skip_stage); + + // Initial Population Generation Rules + node->init_rules.push_back(&init_fill_tile_size); + node->init_rules.push_back(&init_change_compute_location); + node->init_rules.push_back(&init_parallel); + node->init_rules.push_back(&init_unroll); + node->init_rules.push_back(&init_vectorization); + + // Mutation Rules for Evolutionary Search + node->mutation_rules.push_back(std::make_shared(0.90)); + node->mutation_rules.push_back(std::make_shared(0.04)); + node->mutation_rules.push_back(std::make_shared(0.05)); + node->mutation_rules.push_back(std::make_shared(0.01)); + } else if (IsGPUTask(node->search_task)) { + // Sketch Generation Rules node->sketch_rules.push_back(&rule_add_cache_read_stage); node->sketch_rules.push_back(&rule_always_inline); node->sketch_rules.push_back(&rule_special_compute_location_gpu); @@ -117,32 +123,20 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel schedule_cost_model, node->sketch_rules.push_back(&rule_add_cache_write_stage); node->sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); node->sketch_rules.push_back(&rule_multi_level_tiling); - } else { - LOG(FATAL) << "No default sketch rules for target: " << task->target; - } - node->sketch_rules.push_back(&rule_skip_stage); // This should always be the last rule + node->sketch_rules.push_back(&rule_skip_stage); - node->init_rules.push_back(&init_fill_tile_size); // This should always be the first rule - if (IsCPUTask(node->search_task)) { - // The default init population rules for CPU policy - node->init_rules.push_back(&init_change_compute_location); - node->init_rules.push_back(&init_parallel); - node->init_rules.push_back(&init_unroll); - node->init_rules.push_back(&init_vectorization); - } else if (IsCUDATask(node->search_task)) { - // The default init population rules for CUDA policy + // Initial Population Generation Rules + node->init_rules.push_back(&init_fill_tile_size); node->init_rules.push_back(&init_thread_bind); node->init_rules.push_back(&init_unroll); + + // Mutation Rules for Evolutionary Search + node->mutation_rules.push_back(std::make_shared(0.90)); + node->mutation_rules.push_back(std::make_shared(0.10)); } else { - LOG(FATAL) << "No default init rules for target: " << task->target; + LOG(FATAL) << "No default sketch rules for target: " << task->target; } - // The default mutation rules. 
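The mutation rules above now carry selection weights (0.90, 0.04, 0.05, 0.01 in the CPU branch) rather than being applied from a flat list; during evolutionary search a rule is drawn with probability proportional to its weight. A sketch of that prefix-sum sampling scheme (helper names are assumptions mirroring ComputePrefixSumProb and RandomChoose):

    import random

    weights = [0.90, 0.04, 0.05, 0.01]          # as pushed in the CPU branch above
    prefix, acc = [], 0.0
    for w in weights:
        acc += w
        prefix.append(acc)
    prefix = [p / acc for p in prefix]          # [0.90, 0.94, 0.99, 1.00]

    def random_choose(prefix_sum_probs):
        r = random.random()
        return next(i for i, p in enumerate(prefix_sum_probs) if r <= p)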
- node->mutation_rules.push_back(&mutate_tile_size); - node->mutation_rules.push_back(&mutate_max_unroll_factor); - node->mutation_rules.push_back(&mutate_compute_location); - node->mutation_rules.push_back(&mutate_parallel); - data_ = std::move(node); } @@ -169,7 +163,7 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure if (!inputs.empty()) { // Retrain cost models before the next search round PrintTitle("Train cost model", verbose); - schedule_cost_model->Update(inputs, results); + program_cost_model->Update(inputs, results); } // Search one round to get promising states @@ -179,9 +173,7 @@ State SketchPolicyNode::Search(int n_trials, int early_stopping, int num_measure // Infer bound. This is necessary for computing the correct ToStr() for redundancy check best_states = search_task->compute_dag.InferBound(best_states); - PruneInvalidState(search_task, &best_states); random_states = search_task->compute_dag.InferBound(random_states); - PruneInvalidState(search_task, &random_states); // Pick `num_measure_per_iter` states to measure, check hash to remove already measured state // Also pick some random states to do eps-greedy @@ -242,14 +234,16 @@ Array SketchPolicyNode::SearchOneRound(int num_random_states, Array( GetDoubleParam(params, SketchParamKey::EvolutionarySearch::use_measured_ratio) * population)); - bool is_cost_model_reasonable = !schedule_cost_model->IsInstance(); + bool is_cost_model_reasonable = !program_cost_model->IsInstance(); // 1. Generate sketches - const Array& sketches = GenerateSketches(); + if (sketch_cache_.empty()) { + sketch_cache_ = GenerateSketches(); + } // 2. Sample the init population Array init_population = SampleInitPopulation( - sketches, is_cost_model_reasonable ? population - num_use_measured : population); + sketch_cache_, is_cost_model_reasonable ? population - num_use_measured : population); // 3. If the cost model is useless (i.e. 
RandomCostModel), just random pick some generated // states, else perform evolutionary search @@ -260,7 +254,7 @@ Array SketchPolicyNode::SearchOneRound(int num_random_states, Array SketchPolicyNode::GenerateSketches() { // A map that maps state to its current working position (stage_id) std::unordered_map cur_stage_id_map; - cur_stage_id_map[init_state] = static_cast(init_state->stages.size() - 1); + cur_stage_id_map[init_state] = static_cast(init_state->stages.size()) - 1; // Derivation rule based enumeration Array out_states; @@ -341,28 +335,44 @@ Array SketchPolicyNode::GenerateSketches() { Array SketchPolicyNode::SampleInitPopulation(const Array& sketches, int out_size) { int fail_ct = 0; Array out_states; + std::vector rand_gens; + rand_gens.reserve(out_size); + for (int i = 0; i < out_size; i++) { + rand_gens.push_back(std::mt19937(rand_gen())); + } auto tic_begin = std::chrono::high_resolution_clock::now(); while (static_cast(out_states.size()) < out_size && fail_ct < out_size) { - // Random choose a starting sketch - // TODO(jcf94, merrymercy): Maybe choose sketches in different possibility for they may have - // different potential on generating state with better performance - State tmp_s = sketches[(rand_gen)() % sketches.size()]; - - // Derivation rule based enumeration - bool valid = true; - for (const auto& rule : init_rules) { - if (rule->Apply(this, &tmp_s) == PopulationGenerationRule::ResultKind::kInvalid) { - valid = false; - break; + std::vector temp_states(out_size); + + support::parallel_for(0, out_size - out_states.size(), + [this, &temp_states, &sketches, &rand_gens](int index) { + // Random choose a starting sketch + // TODO(jcf94, merrymercy): Maybe choose sketches in different + // possibility for they may have different potential on generating state + // with better performance + State tmp_s = sketches[(rand_gens[index])() % sketches.size()]; + // Derivation rule based enumeration + bool valid = true; + for (const auto& rule : init_rules) { + if (rule->Apply(this, &tmp_s, &rand_gens[index]) == + PopulationGenerationRule::ResultKind::kInvalid) { + valid = false; + break; + } + } + if (valid) { + temp_states[index] = std::move(tmp_s); + } + }); + + for (int i = 0; i < out_size; i++) { + if (temp_states[i].defined()) { + out_states.push_back(std::move(temp_states[i])); + } else { + fail_ct++; } } - - if (valid) { - out_states.push_back(std::move(tmp_s)); - } else { - fail_ct++; - } } double duration = std::chrono::duration_cast>( @@ -379,7 +389,7 @@ Array SketchPolicyNode::EvolutionarySearch(const Array& init_popul Array best_states; auto tic_begin = std::chrono::high_resolution_clock::now(); - size_t population = init_population.size(); + size_t population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population); int num_iters = GetIntParam(params, SketchParamKey::EvolutionarySearch::num_iters); double mutation_prob = GetDoubleParam(params, SketchParamKey::EvolutionarySearch::mutation_prob); @@ -390,135 +400,102 @@ Array SketchPolicyNode::EvolutionarySearch(const Array& init_popul Array* pnow = &states_buf1; Array* pnext = &states_buf2; - // The set of explored states to avoid redundancy. - std::unordered_set explored_set; - - // The heap to maintain the so far best states. 
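A key detail of the parallelized SampleInitPopulation above is the vector of pre-seeded generators, one per parallel index, so sampling stays reproducible for a given master seed no matter how the work is scheduled. The same pattern in plain Python:

    import numpy as np
    from concurrent.futures import ThreadPoolExecutor

    master = np.random.default_rng(0)
    # One child generator per index, all seeded up front by the master generator.
    rngs = [np.random.default_rng(int(s)) for s in master.integers(0, 2**31, size=8)]

    def sample(i):
        return int(rngs[i].integers(0, 100))   # index i draws only from stream i

    with ThreadPoolExecutor() as ex:
        out = list(ex.map(sample, range(8)))   # identical for any worker count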
+ // A heap to keep the best states during evolution using StateHeapItem = std::pair; auto cmp = [](const StateHeapItem& left, const StateHeapItem& right) { return left.second > right.second; }; - using StateHeap = std::priority_queue, decltype(cmp)>; - StateHeap heap(cmp); - auto update_heap = [&heap, &explored_set](const Array& states, - const std::vector& scores, const int out_size) { - float max_score = 0.0; - for (size_t i = 0; i < states.size(); ++i) { - const State& state = states[i]; + std::vector heap; + std::unordered_set in_heap(measured_states_set_); + heap.reserve(out_size); + + // auxiliary global variables + std::vector pop_scores; + std::vector pop_selection_probs; + float max_score = 0.0; + pop_scores.reserve(population); + pop_selection_probs.reserve(population); + std::uniform_real_distribution<> dis(0.0, 1.0); + + // mutation rules + int mutation_success_ct, mutation_fail_ct; + mutation_success_ct = mutation_fail_ct = 0; + std::vector rule_weights; + std::vector rule_selection_probs; + for (const auto& rule : mutation_rules) { + rule_weights.push_back(rule->weight); + } + ComputePrefixSumProb(rule_weights, &rule_selection_probs); + + // Genetic Algorithm + for (int k = 0; k < num_iters + 1; ++k) { + // Maintain the heap + *pnow = search_task->compute_dag.InferBound(*pnow); + PruneInvalidState(search_task, pnow); + program_cost_model->Predict(search_task, *pnow, &pop_scores); + + for (size_t i = 0; i < pnow->size(); ++i) { + const State& state = (*pnow)[i]; std::string state_str = state.ToStr(); - // Skip redundant states. - if (explored_set.count(state_str) > 0) { - continue; - } - explored_set.insert(state_str); - - if (static_cast(heap.size()) < out_size) { - // Directly push item if the heap is not full yet. - heap.push({state, scores[i]}); - } else if (scores[i] > heap.top().second) { - // Replace the worst state in the heap with the new state. - heap.pop(); - heap.push({state, scores[i]}); + if (in_heap.count(state_str) == 0) { + if (static_cast(heap.size()) < out_size) { + heap.emplace_back((*pnow)[i], pop_scores[i]); + std::push_heap(heap.begin(), heap.end(), cmp); + in_heap.insert(state_str); + } else if (pop_scores[i] > heap.front().second) { + std::string old_state_str = heap.front().first.ToStr(); + in_heap.erase(old_state_str); + in_heap.insert(state_str); + + std::pop_heap(heap.begin(), heap.end(), cmp); + heap.back() = StateHeapItem(state, pop_scores[i]); + std::push_heap(heap.begin(), heap.end(), cmp); + } + if (pop_scores[i] > max_score) { + max_score = pop_scores[i]; + } } - max_score = (scores[i] > max_score) ? scores[i] : max_score; } - return max_score; - }; - // Cost model predicted scores. - std::vector scores; - scores.reserve(population); - - // The function to generate prefix sum probabilities based on the given scores. - auto assign_prob = [](const std::vector& scores, std::vector* prefix_sum_probs) { - // Compute selection probabilities. 
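The loop above keeps the best `out_size` states in a min-heap keyed by predicted score, while the in_heap set (seeded from measured_states_set_) rejects states that are already present or were measured before. Equivalent bookkeeping with heapq:

    import heapq

    out_size = 3
    heap, in_heap = [], set()        # min-heap of (score, state_str)

    def update(state_str, score):
        if state_str in in_heap:     # redundancy check
            return
        if len(heap) < out_size:
            heapq.heappush(heap, (score, state_str))
            in_heap.add(state_str)
        elif score > heap[0][0]:     # better than the current worst
            _, evicted = heapq.heappushpop(heap, (score, state_str))
            in_heap.discard(evicted)
            in_heap.add(state_str)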
- double sum = 0.0; - prefix_sum_probs->resize(scores.size()); - for (size_t i = 0; i < scores.size(); ++i) { - sum += std::max(scores[i], 0.0f); - (*prefix_sum_probs)[i] = sum; + // Print statistical information + if (k % 5 == 0 || k == num_iters) { + StdCout(verbose) << "GA Iter: " << k << std::fixed << std::setprecision(4) + << "\tMax score: " << max_score << "\tMin score: " << heap.front().second + << "\t#Pop: " << pnow->size() << "\t#M+: " << mutation_success_ct / (k + 1) + << "\t#M-: " << mutation_fail_ct / (k + 1) << std::endl; } - for (size_t i = 0; i < scores.size(); ++i) { - (*prefix_sum_probs)[i] /= sum; + if (k == num_iters) { + break; } - }; - // State selection probabilities. - std::uniform_real_distribution<> uniform_dist(0.0, 1.0); - std::vector state_select_probs; - state_select_probs.reserve(population); + // Compute selection probability + ComputePrefixSumProb(pop_scores, &pop_selection_probs); - // Mutation rule selection probabilities. - std::vector rule_select_probs; - rule_select_probs.reserve(mutation_rules.size()); - std::vector rule_levels; - for (const auto& rule : mutation_rules) { - rule_levels.push_back(rule->GetLevel(search_task)); - } - assign_prob(rule_levels, &rule_select_probs); - - // Evaluate the init populations. - *pnow = search_task->compute_dag.InferBound(*pnow); - PruneInvalidState(search_task, pnow); - CHECK_GT(pnow->size(), 0) << "All initial populations are invalid"; - schedule_cost_model->Predict(search_task, *pnow, &scores); - - // Maintain the best states in the heap. - float max_score = update_heap(*pnow, scores, out_size); - - // Genetic algorithm. - for (auto iter_idx = 1; iter_idx <= num_iters; ++iter_idx) { - // Assign the selection probability to each state based on the cost model scores. - assign_prob(scores, &state_select_probs); - - // TODO(@comaniac): Perform cross over. - - // Perform mutations. - size_t fail_ct = 0; - while (pnext->size() < population && fail_ct < population * 2) { - // Select a state to be mutated. - State tmp_s = (*pnow)[RandomChoose(state_select_probs, &rand_gen)]; - if (uniform_dist(rand_gen) < mutation_prob) { - // Select a rule and mutate the state. - const auto& rule = mutation_rules[RandomChoose(rule_select_probs, &rand_gen)]; - if (rule->Apply(this, &tmp_s) == PopulationGenerationRule::ResultKind::kValid) { + // Do mutation + while (pnext->size() < population) { + State tmp_s = (*pnow)[RandomChoose(pop_selection_probs, &rand_gen)]; + + if (dis(rand_gen) < mutation_prob) { + const auto& rule = mutation_rules[RandomChoose(rule_selection_probs, &rand_gen)]; + if (rule->Apply(this, &tmp_s, &rand_gen) == PopulationGenerationRule::ResultKind::kValid) { pnext->push_back(std::move(tmp_s)); + mutation_success_ct++; } else { - fail_ct++; + mutation_fail_ct++; } } else { - // Do not mutate this state in this round. pnext->push_back(std::move(tmp_s)); } } - // Evaluate the new populations. - *pnext = search_task->compute_dag.InferBound(*pnext); - PruneInvalidState(search_task, pnext); - - // Throw away all states generated in this iterations if all new states are invalid. - if (pnext->size() > 0) { - std::swap(pnext, pnow); - schedule_cost_model->Predict(search_task, *pnow, &scores); - - // Maintain the best states in the heap. - float iter_max_score = update_heap(*pnow, scores, out_size); - max_score = (iter_max_score > max_score) ? 
iter_max_score : max_score; - } + std::swap(pnext, pnow); pnext->clear(); - - if (iter_idx % 5 == 0 || iter_idx == num_iters) { - StdCout(verbose) << "GA Iter: " << iter_idx << std::fixed << std::setprecision(4) - << "\tMax Score: " << max_score << "\tPop Size: " << pnow->size() - << std::endl; - } } - // Copy best states in the heap to the output. - while (!heap.empty()) { - auto item = heap.top(); - heap.pop(); + // Copy best states in the heap to out_states + std::sort(heap.begin(), heap.end(), cmp); + for (auto& item : heap) { best_states.push_back(std::move(item.first)); } @@ -580,10 +557,10 @@ Array SketchPolicyNode::PickStatesWithEpsGreedy(const Array } TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicy") - .set_body_typed([](SearchTask task, CostModel schedule_cost_model, - Map params, int seed, int verbose, + .set_body_typed([](SearchTask task, CostModel program_cost_model, Map params, + int seed, int verbose, Optional> init_search_callbacks) { - return SketchPolicy(task, schedule_cost_model, params, seed, verbose, init_search_callbacks); + return SketchPolicy(task, program_cost_model, params, seed, verbose, init_search_callbacks); }); TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicyGenerateSketches") diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h index 2d93d8775c86..21aaa6ef7b90 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.h +++ b/src/auto_scheduler/search_policy/sketch_policy.h @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -88,15 +89,15 @@ struct SketchParamKey { class SketchPolicyNode : public SearchPolicyNode { public: /*! \brief The cost model to estimate the complete schedules. */ - CostModel schedule_cost_model; + CostModel program_cost_model; /*! \brief The parameters map for this search policy. */ Map params; /*! \brief The rules to generate sketches. */ std::vector sketch_rules; - /*! \brief The rules to generate initial states. */ + /*! \brief The rules to generate initial population. */ std::vector init_rules; - /*! \brief The rules to mutate states. */ - std::vector mutation_rules; + /*! \brief The rules to mutate states in the evolutionary search. */ + std::vector> mutation_rules; /*! \brief Random generator. */ std::mt19937 rand_gen; /*! \brief Memorize split space for Split. */ @@ -154,6 +155,9 @@ class SketchPolicyNode : public SearchPolicyNode { /*! \brief The number of states to measure per iteration. */ int num_measure_per_iter_; + + /*! \brief The cached sketches */ + Array sketch_cache_; }; /*! @@ -165,14 +169,14 @@ class SketchPolicy : public SearchPolicy { /*! * \brief The constructor. * \param task The SearchTask for the computation declaration. - * \param schedule_cost_model The cost model for complete programs. + * \param program_cost_model The cost model for complete programs. * \param params The parameters map for this search process. * \param seed The random seed of this search process. * \param verbose Verbose level. 0 for silent, 1 to output information during schedule * search. * \param init_search_callbacks SearchCallback to be called before schedule search. 
*/ - SketchPolicy(SearchTask task, CostModel schedule_cost_model, Map params, + SketchPolicy(SearchTask task, CostModel program_cost_model, Map params, int seed, int verbose, Optional> init_search_callbacks); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(SketchPolicy, SearchPolicy, SketchPolicyNode); diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index dab6e4d65f20..089ac774d1d6 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -34,6 +34,9 @@ namespace tvm { namespace auto_scheduler { +static std::vector auto_unroll_configs_cpu = {0, 16, 64, 512}; +static std::vector auto_unroll_configs_gpu = {0, 16, 64, 512, 1024}; + /********** Sketch Generation Rule **********/ /********** RuleSkipStage **********/ @@ -436,8 +439,8 @@ std::vector> RuleSpecialComputeLocationGPU::Apply( /********** Init Population **********/ -PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { StateNode* pstate = state->CopyOnWrite(); // Scan the transformation history and randomly fill tiles size for all SplitStep for (size_t step_id = 0; step_id < (*state)->transform_steps.size(); ++step_id) { @@ -458,7 +461,7 @@ PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* p const auto& candidate_lens = policy->split_memo.GetFactorizationSchemes( extent, ps->lengths.size(), GetIntParam(policy->params, SketchParamKey::max_innermost_split_factor)); - const auto& candidate_lengths = candidate_lens[(policy->rand_gen)() % candidate_lens.size()]; + const auto& candidate_lengths = candidate_lens[(*rand_gen)() % candidate_lens.size()]; pstate->transform_steps.Set( step_id, @@ -472,9 +475,8 @@ PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* p return ResultKind::kValid; } -PopulationGenerationRule::ResultKind MutateComputeLocationCommon(SketchPolicyNode* policy, - State* state, - bool infer_bound = true) { +PopulationGenerationRule::ResultKind InitChangeComputeLocation::Apply( + SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const { if (GetIntParam(policy->params, SketchParamKey::disable_change_compute_location)) { return PopulationGenerationRule::ResultKind::kValid; } @@ -490,83 +492,10 @@ PopulationGenerationRule::ResultKind MutateComputeLocationCommon(SketchPolicyNod continue; } - int target_stage_id = GetSingleConsumerId(policy->search_task, *state, stage_id); - if (target_stage_id < 0) { - continue; - } - const Stage& target_stage = (*state)->stages[target_stage_id]; - - std::vector> candidates; - bool target_compute_at_other = target_stage->compute_at == ComputeAtKind::kIter; - bool target_is_tiled = IsTiled(target_stage); - - bool visited_reduce = false; - // enumerate compute_at location at target_stage - // TODO(merrymercy): More analysis here to make smarter choices - for (size_t i = 0; i < target_stage->iters.size(); ++i) { - const Iterator& target_iter = target_stage->iters[i]; - if (target_iter->iter_kind == IteratorKind::kReduction) { - visited_reduce = true; - if (!target_is_tiled) { // Do not go into reduce iter - break; - } - } else if (target_iter->iter_kind == IteratorKind::kSpatial) { - if (visited_reduce) { // Do not go into inner tile - break; - } - } + std::vector> candidates = + 
GetComputeLocationCandidates(policy->search_task, *state, stage_id); - if (target_iter->annotation == IteratorAnnotation::kUnroll) { - // Do not go into the unroll region of const tensor indices - break; - } - - if (GetExtent(target_iter) == 1) { - // Skip iterators with length of 1 - continue; - } - if (target_compute_at_other && target_iter->iter_kind == IteratorKind::kSpatial && - StrEndsWith(target_iter->name, ".0")) { - // Skip the first level iterators if target stage compute_at another stage - // In this case, the lengths of first level iterators are always one - continue; - } - candidates.emplace_back(target_stage_id, i); - - if ((*state)->attach_map->iter_to_attached_stages.count(std::make_pair(target_stage_id, i))) { - break; - } - } - - // if the target_stage is already compute_at another stage X, try also compute_at X - // We call stage X as `target_target_stage` - if (target_compute_at_other) { - int target_target_stage_id; - target_target_stage_id = (*state)->attach_map->stage_to_attach_iter.at(target_stage_id).first; - const Stage& target_target_stage = (*state)->stages[target_target_stage_id]; - - for (size_t i = 0; i < target_target_stage->iters.size(); ++i) { - const Iterator& target_target_iter = target_target_stage->iters[i]; - if (target_target_iter->iter_kind == IteratorKind::kReduction || - (*state)->attach_map->iter_to_attached_stages.count( - std::make_pair(target_target_stage_id, i))) { - break; - } - - if (target_target_iter->annotation == IteratorAnnotation::kUnroll) { - // Do not go into the unroll region of const tensor indices - break; - } - - if (GetExtent(target_target_iter) == 1) { // skip iterators with length of 1 - continue; - } - - candidates.emplace_back(target_target_stage_id, i); - } - } - - int choice = (policy->rand_gen)() % (candidates.size() + 2); + int choice = (*rand_gen)() % (candidates.size() + 2); if (choice == 0) { if (!HasReduceIter(stage)) { @@ -585,19 +514,16 @@ PopulationGenerationRule::ResultKind MutateComputeLocationCommon(SketchPolicyNod } } - if (infer_bound) { + try { *state = policy->search_task->compute_dag.InferBound(*state); + } catch (std::exception& e) { + return PopulationGenerationRule::ResultKind::kInvalid; } return PopulationGenerationRule::ResultKind::kValid; } -PopulationGenerationRule::ResultKind InitChangeComputeLocation::Apply(SketchPolicyNode* policy, - State* state) const { - return MutateComputeLocationCommon(policy, state, true); -} - -PopulationGenerationRule::ResultKind InitParallel::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind InitParallel::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { std::function annotate_parallel; annotate_parallel = [&annotate_parallel](const SketchPolicyNode& policy, State* state, @@ -661,11 +587,10 @@ PopulationGenerationRule::ResultKind InitParallel::Apply(SketchPolicyNode* polic return ResultKind::kValid; } -PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, - State* state) const { - std::vector auto_unroll_configs = IsGPUTask(policy->search_task) - ? std::vector({0, 16, 64, 512, 1024}) - : std::vector({0, 16, 64, 512}); +PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { + std::vector& auto_unroll_configs = + IsGPUTask(policy->search_task) ? 
auto_unroll_configs_gpu : auto_unroll_configs_cpu; for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { const Stage& stage = (*state)->stages[stage_id]; // Skip the inlined stage and placeholder stage @@ -704,7 +629,7 @@ PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, if (HasReduceIter(stage)) { // Use auto unroll for multi level tiled stage - int value = auto_unroll_configs[(policy->rand_gen)() % auto_unroll_configs.size()]; + int value = auto_unroll_configs[(*rand_gen)() % auto_unroll_configs.size()]; state->pragma(stage_id, (*state)->stages[stage_id]->iters[0], std::string("auto_unroll_max_step") + "$" + std::to_string(value)); } @@ -714,7 +639,8 @@ PopulationGenerationRule::ResultKind InitUnroll::Apply(SketchPolicyNode* policy, } PopulationGenerationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* policy, - State* state) const { + State* state, + std::mt19937* rand_gen) const { for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { const Stage& stage = (*state)->stages[stage_id]; // Skip the inlined stage and placeholder stage @@ -758,7 +684,7 @@ PopulationGenerationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* if (num_fusible > 1) { // Select a random range to fuse - num_fusible = 1 + (policy->rand_gen)() % (num_fusible - 1); + num_fusible = 1 + (*rand_gen)() % (num_fusible - 1); } if (num_fusible == 1) { @@ -772,8 +698,8 @@ PopulationGenerationRule::ResultKind InitVectorization::Apply(SketchPolicyNode* return ResultKind::kValid; } -PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { std::set multi_level_tiling_root_set; for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { if (NeedsMultilevelTiling(policy->search_task, *state, stage_id)) { @@ -801,6 +727,10 @@ PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* pol // Deal with the cross-thread reduction generated by RuleCrossThreadReduction if (HasCrossThreadReduction(*state, stage_id)) { + if (stage->compute_at != ComputeAtKind::kRoot) { + continue; + } + Iterator fused_it; *state = std::move(FuseAllOuterSpaceIterators(*state, stage_id, &fused_it)); state->bind(stage_id, fused_it, IteratorAnnotation::kBlockX); @@ -922,8 +852,8 @@ PopulationGenerationRule::ResultKind InitThreadBind::Apply(SketchPolicyNode* pol return ResultKind::kValid; } -PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { int max_innermost_split_factor = GetIntParam(policy->params, SketchParamKey::max_innermost_split_factor); @@ -952,7 +882,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol const SplitStepNode* ps; do { - step_id = split_step_ids[(policy->rand_gen)() % split_step_ids.size()]; + step_id = split_step_ids[(*rand_gen)() % split_step_ids.size()]; ps = (*state)->transform_steps[step_id].as(); CHECK(ps != nullptr); extent = GetIntImm(ps->extent.value()); @@ -973,7 +903,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol // Random permute the tile size order. 
std::vector random_perm; - RandomPermutation(lengths.size(), &random_perm, &(policy->rand_gen)); + RandomPermutation(lengths.size(), &random_perm, rand_gen); // Try to divide a factor from one tile size and multiple it to another. for (size_t i = 0; i < random_perm.size(); ++i) { @@ -983,6 +913,7 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol continue; } + // Divide one factor from lengths[src_idx] and multiply it to lengths[dst_idx] size_t dst_idx = random_perm[(i + 1) % random_perm.size()]; const std::vector& factors = policy->split_memo.GetFactors(length); CHECK_GE(factors.size(), 1); @@ -1000,9 +931,9 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol // Failed on this dst_idx, try next one. continue; } - divide_factor = factors[1 + (policy->rand_gen)() % (max_factor_index)]; + divide_factor = factors[1 + (*rand_gen)() % (max_factor_index)]; } else { - divide_factor = factors[1 + (policy->rand_gen)() % (factors.size() - 1)]; + divide_factor = factors[1 + (*rand_gen)() % (factors.size() - 1)]; } // Divide one factor from lengths[src_idx] and multiply it to lengths[dst_idx]. @@ -1017,6 +948,8 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol } } + CHECK_LE(GetIntImm(new_lengths.back()), max_innermost_split_factor); + StateNode* pstate = state->CopyOnWrite(); pstate->transform_steps.Set( step_id, SplitStep(ps->stage_id, ps->iter_id, ps->extent, @@ -1027,43 +960,103 @@ PopulationGenerationRule::ResultKind MutateTileSize::Apply(SketchPolicyNode* pol return ResultKind::kInvalid; } -PopulationGenerationRule::ResultKind MutateMaxUnrollFactor::Apply(SketchPolicyNode* policy, - State* state) const { +PopulationGenerationRule::ResultKind MutateAutoUnroll::Apply(SketchPolicyNode* policy, State* state, + std::mt19937* rand_gen) const { // Extract all auto_unroll_max_step pragma steps. - std::vector annotate_steps; + std::vector pragma_steps; for (size_t i = 0; i < (*state)->transform_steps.size(); ++i) { if (auto ps = (*state)->transform_steps[i].as()) { if (StrStartsWith(ps->pragma_type, "auto_unroll_max_step")) { - annotate_steps.push_back(i); + pragma_steps.push_back(i); } } } - if (annotate_steps.empty()) { + if (pragma_steps.empty()) { return ResultKind::kInvalid; } - // Random pick up one unroll factor candidate. - auto cands = (IsGPUTask(policy->search_task)) ? &gpu_unroll_cands_ : &cpu_unroll_cands_; - auto new_factor = std::to_string((*cands)[(policy->rand_gen)() % cands->size()]); + std::vector& auto_unroll_configs = + IsGPUTask(policy->search_task) ? auto_unroll_configs_gpu : auto_unroll_configs_cpu; - // Random pick up and mutate an unroll step. 
-  auto step_id = annotate_steps[(policy->rand_gen)() % annotate_steps.size()];
+  // Randomly pick an auto unroll pragma step
+  auto step_id = pragma_steps[(*rand_gen)() % pragma_steps.size()];
   auto ps = (*state)->transform_steps[step_id].as();
   CHECK(ps);
+
+  // Mutate its value to a random candidate
+  auto val = std::to_string(auto_unroll_configs[(*rand_gen)() % auto_unroll_configs.size()]);
   StateNode* pstate = state->CopyOnWrite();
-  pstate->transform_steps.Set(step_id,
-                              PragmaStep(ps->stage_id, ps->iter_id,
-                                         std::string("auto_unroll_max_step") + "$" + new_factor));
+  pstate->transform_steps.Set(step_id, PragmaStep(ps->stage_id, ps->iter_id,
+                                                  std::string("auto_unroll_max_step") + "$" + val));
   return ResultKind::kValid;
 }
 
 PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNode* policy,
-                                                                  State* state) const {
-  return MutateComputeLocationCommon(policy, state, false);
+                                                                  State* state,
+                                                                  std::mt19937* rand_gen) const {
+  if (GetIntParam(policy->params, SketchParamKey::disable_change_compute_location)) {
+    return PopulationGenerationRule::ResultKind::kInvalid;
+  }
+
+  // Extract all compute_at steps.
+  std::vector compute_at_steps;
+  for (size_t s = 0; s < (*state)->transform_steps.size(); ++s) {
+    if (auto ps = (*state)->transform_steps[s].as()) {
+      int stage_inc = GetTargetStageIDInState(*state, s) - ps->stage_id;
+
+      if (IsTiled((*state)->stages[ps->stage_id + stage_inc])) {
+        continue;
+      }
+
+      if (NeedsMultilevelTiling(policy->search_task, *state, ps->stage_id + stage_inc)) {
+        continue;
+      }
+      compute_at_steps.push_back(s);
+    }
+  }
+  if (compute_at_steps.empty()) {
+    return PopulationGenerationRule::ResultKind::kInvalid;
+  }
+
+  // Randomly pick one step
+  size_t step_id = compute_at_steps[(*rand_gen)() % compute_at_steps.size()];
+  auto ps = (*state)->transform_steps[step_id].as();
+  int stage_inc = GetTargetStageIDInState(*state, step_id) - ps->stage_id;
+  CHECK(ps != nullptr);
+
+  std::vector> candidates =
+      GetComputeLocationCandidates(policy->search_task, *state, ps->stage_id + stage_inc);
+
+  if (candidates.empty()) {
+    return PopulationGenerationRule::ResultKind::kInvalid;
+  }
+
+  int choice = (*rand_gen)() % (candidates.size());
+  int new_compute_at_stage_id = candidates[choice].first;
+  int new_compute_at_iter_id = candidates[choice].second;
+
+  // Replay a new state.
+  State tmp_s = policy->search_task->compute_dag->init_state;
+  for (size_t s = 0; s < (*state)->transform_steps.size(); ++s) {
+    if (s == step_id) {
+      tmp_s.CopyOnWrite()->transform_steps.push_back(
+          ComputeAtStep(ps->stage_id, new_compute_at_stage_id - stage_inc, new_compute_at_iter_id));
+    } else {
+      tmp_s.CopyOnWrite()->transform_steps.push_back((*state)->transform_steps[s]);
+    }
+    try {
+      StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag);
+    } catch (dmlc::Error& e) {
+      return PopulationGenerationRule::ResultKind::kInvalid;
+    }
+  }
+
+  *state = tmp_s;
+  return PopulationGenerationRule::ResultKind::kValid;
 }
 
-PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* policy,
-                                                           State* state) const {
+PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* policy, State* state,
+                                                           std::mt19937* rand_gen) const {
   // This mutation rule only focuses on a case that parallel was added to
   // the outermost loop and the loop is generated by fusing other loops.
   // In short, we mutate the fusion step before the parallel step.
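The two mutation styles above are worth contrasting: `MutateAutoUnroll` edits the selected `PragmaStep` in place, while `MutateComputeLocation` replays the whole transform history from `init_state` with one `ComputeAtStep` substituted, and rejects the mutation if any replayed step throws. A generic sketch of that replay-with-substitution shape; `Step`, `State`, and `ApplyStep` are toy stand-ins here, not the TVM types:

```cpp
#include <stdexcept>
#include <utility>
#include <vector>

struct Step { int payload; };               // stands in for a transform step
struct State { std::vector<Step> steps; };  // stands in for the replayed state

// Stand-in for StepApplyToState: applying a step may fail in the new context.
void ApplyStep(const Step& s, State* st) {
  if (s.payload < 0) throw std::runtime_error("step invalid in this context");
  st->steps.push_back(s);
}

// Rebuild from scratch, swapping in `replacement` at position `step_id`.
// Returning false rejects the whole mutation and leaves `out` untouched.
bool ReplayWithSubstitution(const std::vector<Step>& history, size_t step_id,
                            const Step& replacement, State* out) {
  State tmp;
  for (size_t i = 0; i < history.size(); ++i) {
    try {
      ApplyStep(i == step_id ? replacement : history[i], &tmp);
    } catch (const std::exception&) {
      return false;
    }
  }
  *out = std::move(tmp);
  return true;
}
```

Replaying costs more than in-place editing, but it keeps all derived state consistent with the substituted step, which is presumably why the code recomputes stage ids through `GetTargetStageIDInState` before building the new `ComputeAtStep`.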
@@ -1087,7 +1080,7 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol
   }
 
   // Randomly pick one parallel step.
-  size_t step_id = parallel_steps[(policy->rand_gen)() % parallel_steps.size()];
+  size_t step_id = parallel_steps[(*rand_gen)() % parallel_steps.size()];
   auto ps = (*state)->transform_steps[step_id].as();
   CHECK(ps);
   size_t stage_id = ps->stage_id;
@@ -1126,7 +1119,7 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol
 
   // Mutate the fusion iters and replay the mutated fused/annotation steps.
   int iter_offset = 0;
-  if (RandomChoose(fuse_dir, &(policy->rand_gen)) == 0) {
+  if (RandomChoose(fuse_dir, rand_gen) == 0) {
     fused_ids.pop_back();
     iter_offset = 1;
   } else {
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h
index 418fbda6a030..928efc518827 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.h
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h
@@ -124,7 +124,7 @@ DEFINE_SKETCH_GENERATION_RULE(RuleSpecialComputeLocationGPU);
 
 /********** Init Population **********/
 
-/*! \brief The base class for derivation rules used in the initial population. */
+/*! \brief The base class for rules used to annotate the sketches to get the initial population. */
 class PopulationGenerationRule {
  public:
   /*! \brief Result enumeration of the apply function. */
@@ -137,19 +137,24 @@ class PopulationGenerationRule {
    * \param state The state to apply this rule, update inplace.
    * \return The result of this rule, indicate if there's any valid state generated.
    */
-  virtual ResultKind Apply(SketchPolicyNode* policy, State* state) const = 0;
+  virtual ResultKind Apply(SketchPolicyNode* policy, State* state,
+                           std::mt19937* rand_gen) const = 0;
+
+  /*! \brief The destructor */
+  virtual ~PopulationGenerationRule() = default;
 };
 
-#define DEFINE_INIT_POPULATION_RULE(rule_name) \
-  class rule_name : public PopulationGenerationRule { \
-   public: \
-    ResultKind Apply(SketchPolicyNode* policy, State* state) const final; \
+// A helper to define population initialization rules
+#define DEFINE_INIT_POPULATION_RULE(rule_name) \
+  class rule_name : public PopulationGenerationRule { \
+   public: \
+    ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \
   };
 
 /*! \brief The rule that fills the incomplete SplitSteps. */
 DEFINE_INIT_POPULATION_RULE(InitFillTileSize);
 
-/*! \brief The rule that randomly changes the computation location for some stages, which do not
+/*! \brief The rule that randomly changes the computation location for some stages that do not
  * need tiling and are not strictly inlineable(e.g. data padding). */
 DEFINE_INIT_POPULATION_RULE(InitChangeComputeLocation);
 
@@ -170,50 +175,37 @@ DEFINE_INIT_POPULATION_RULE(InitThreadBind);
 
 /*! \brief The base class for mutation rules used in the evolutionary search. */
 class PopulationMutationRule : public PopulationGenerationRule {
  public:
-  /*!
-   * \brief Get the priority level of this mutation rule.
-   * \return The priority level of this mutation rule. Higher the better.
+  /*! \brief The constructor
+   * \param selection_weight the probability of applying this rule is
+   *        proportional to this weight
    */
-  virtual int GetLevel(const SearchTask& task) const = 0;
+  explicit PopulationMutationRule(double selection_weight) : weight(selection_weight) {}
+
+  /*! \brief The weight of this rule */
+  double weight;
 };
 
-// A helper to define mutation rules with a constant rule level.
-#define DEFINE_MUTATE_POPULATION_RULE(rule_name, rule_level) \
-  class rule_name : public PopulationMutationRule { \
-   public: \
-    ResultKind Apply(SketchPolicyNode* policy, State* state) const final; \
-    int GetLevel(const SearchTask& task) const final { return rule_level; } \
+// A helper to define mutation rules used in the evolutionary search
+#define DEFINE_MUTATE_POPULATION_RULE(rule_name) \
+  class rule_name : public PopulationMutationRule { \
+   public: \
+    explicit rule_name(double weight) : PopulationMutationRule(weight) {} \
+    ResultKind Apply(SketchPolicyNode* policy, State* state, std::mt19937* rand_gen) const final; \
   };
 
 /*! \brief The rule that mutates tile size by randomly dividing a tile size by a factor
            and multipling it to another tile size. */
-DEFINE_MUTATE_POPULATION_RULE(MutateTileSize, 100);
-
-/*! \brief The rule that mutates the fusion iterators annotated by parallel. */
-DEFINE_MUTATE_POPULATION_RULE(MutateParallel, 50);
-
-/*! \brief The rule that mutates the factor of a randomly selected auto max unroll step. */
-class MutateMaxUnrollFactor : public PopulationMutationRule {
- public:
-  ResultKind Apply(SketchPolicyNode* policy, State* state) const final;
-  int GetLevel(const SearchTask& task) const final { return 10; }
+DEFINE_MUTATE_POPULATION_RULE(MutateTileSize);
 
-  const std::vector cpu_unroll_cands_ = {0, 16, 64, 512, 1024};
-  const std::vector gpu_unroll_cands_ = {0, 16, 64, 512};
-};
+/*! \brief The rule that mutates the number of fused outer iterators annotated by parallel. */
+DEFINE_MUTATE_POPULATION_RULE(MutateParallel);
 
-/*! \brief The rule that randomly changes the computation location for some stages, which do not
+/*! \brief The rule that randomly changes the computation location for some stages that do not
  * need tiling and are not strictly inlineable(e.g. data padding). */
-class MutateComputeLocation : public PopulationMutationRule {
- public:
-  ResultKind Apply(SketchPolicyNode* policy, State* state) const final;
-  int GetLevel(const SearchTask& task) const final {
-    if (IsGPUTask(task)) {
-      return 0;
-    }
-    return 5;
-  }
-};
+DEFINE_MUTATE_POPULATION_RULE(MutateComputeLocation);
+
+/*! \brief The rule that mutates the value of a randomly selected auto unroll pragma step.
*/ +DEFINE_MUTATE_POPULATION_RULE(MutateAutoUnroll); } // namespace auto_scheduler } // namespace tvm diff --git a/src/auto_scheduler/search_policy/utils.cc b/src/auto_scheduler/search_policy/utils.cc index 62ffce4dc875..174ca3105a4b 100644 --- a/src/auto_scheduler/search_policy/utils.cc +++ b/src/auto_scheduler/search_policy/utils.cc @@ -67,6 +67,87 @@ Array GetSpatialSplitStepIds(const State& s, int stage_id) { return spatial_split_step_ids; } +std::vector> GetComputeLocationCandidates(const SearchTask& task, + const State& state, int stage_id) { + int target_stage_id = GetSingleConsumerId(task, state, stage_id); + if (target_stage_id < 0) { + return {}; + } + const Stage& target_stage = state->stages[target_stage_id]; + + std::vector> candidates; + bool target_compute_at_other = target_stage->compute_at == ComputeAtKind::kIter; + bool target_is_tiled = IsTiled(target_stage); + + bool visited_reduce = false; + // Enumerate compute_at location at target_stage + // TODO(merrymercy): More analysis here to make smarter choices + for (size_t i = 0; i < target_stage->iters.size(); ++i) { + const Iterator& target_iter = target_stage->iters[i]; + if (target_iter->iter_kind == IteratorKind::kReduction) { + visited_reduce = true; + if (!target_is_tiled) { // Do not go into reduce iter + break; + } + } else if (target_iter->iter_kind == IteratorKind::kSpatial) { + if (visited_reduce) { // Do not go into inner tile + break; + } + } + + if (target_iter->annotation == IteratorAnnotation::kUnroll) { + // Do not go into the unroll region of const tensor indices + break; + } + + if (GetExtent(target_iter) == 1) { + // Skip iterators with length of 1 + continue; + } + if (target_compute_at_other && target_iter->iter_kind == IteratorKind::kSpatial && + StrEndsWith(target_iter->name, ".0")) { + // Skip the first level iterators if target stage compute_at another stage + // In this case, the lengths of first level iterators are always one + continue; + } + candidates.emplace_back(target_stage_id, i); + + if (state->attach_map->iter_to_attached_stages.count(std::make_pair(target_stage_id, i))) { + break; + } + } + + // if the target_stage is already compute_at another stage X, try also compute_at X + // We call stage X as `target_target_stage` + if (target_compute_at_other) { + int target_target_stage_id; + target_target_stage_id = state->attach_map->stage_to_attach_iter.at(target_stage_id).first; + const Stage& target_target_stage = state->stages[target_target_stage_id]; + + for (size_t i = 0; i < target_target_stage->iters.size(); ++i) { + const Iterator& target_target_iter = target_target_stage->iters[i]; + if (target_target_iter->iter_kind == IteratorKind::kReduction || + state->attach_map->iter_to_attached_stages.count( + std::make_pair(target_target_stage_id, i))) { + break; + } + + if (target_target_iter->annotation == IteratorAnnotation::kUnroll) { + // Do not go into the unroll region of const tensor indices + break; + } + + if (GetExtent(target_target_iter) == 1) { // skip iterators with length of 1 + continue; + } + + candidates.emplace_back(target_target_stage_id, i); + } + } + + return candidates; +} + State DoMultiLevelTiling(const State& state, int stage_id, const std::string& format, std::vector* spatial_split_step_ids) { // Temporal object to be used if the input pointer is nullptr @@ -327,25 +408,62 @@ void PruneInvalidState(const SearchTask& task, Array* states) { } if (pt == 0) { - LOG(INFO) << "All states are invalid."; + LOG(FATAL) << "Internal error: All states are invalid."; } else { 
    states->resize(pt);
  }
}

+/********** SplitFactorizationMemo **********/
+
+void SplitFactorizationMemo::ReadWriteLock::GetRead() {
+  std::unique_lock lock(cv_mutex_);
+  // Wake up and get the mutex lock if there's no writing thread
+  cv_.wait(lock, [this]() { return !this->is_writing_; });
+  read_count_++;
+}
+
+void SplitFactorizationMemo::ReadWriteLock::GetWrite() {
+  std::unique_lock lock(cv_mutex_);
+  // Wake up and get the mutex lock if there's no reading or writing threads
+  cv_.wait(lock, [this]() { return this->read_count_ == 0 && !this->is_writing_; });
+  is_writing_ = true;
+}
+
+void SplitFactorizationMemo::ReadWriteLock::UnlockRead() {
+  std::lock_guard lock(cv_mutex_);
+  read_count_--;
+  // Notify the other blocked threads if this is the last reading thread
+  if (read_count_ == 0) {
+    cv_.notify_one();
+  }
+}
+
+void SplitFactorizationMemo::ReadWriteLock::UnlockWrite() {
+  std::lock_guard lock(cv_mutex_);
+  is_writing_ = false;
+  // Notify the other blocked threads
+  cv_.notify_one();
+}
+
 const Array>& SplitFactorizationMemo::GetFactorizationSchemes(
     int extent, int n_lengths, int max_innermost_factor) {
   QueryKey key = std::make_tuple(extent, n_lengths, max_innermost_factor);
-  auto it = memory_.find(key);
-  if (it != memory_.end()) {
+  const auto& const_memory = memory_;
+  lock_.GetRead();
+  const auto& it = const_memory.find(key);
+  const auto& memory_end = const_memory.end();
+  lock_.UnlockRead();
+  if (it != memory_end) {
     return it->second;
   }
+  lock_.GetWrite();
   tmp_stack_ = Array(n_lengths, Integer());
   results_ = &memory_[key];
   n_lengths_ = n_lengths;
   DfsEnumerate(0, extent, max_innermost_factor);
+  lock_.UnlockWrite();
   return *results_;
 }
@@ -383,6 +501,8 @@ const std::vector& SplitFactorizationMemo::GetFactors(int n) {
   return res;
 }
 
+/********** Utils interface API for ffi **********/
+
 TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyUtilsIsTiled")
     .set_body_typed([](const Stage& stage) { return IsTiled(stage); });
diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h
index d2ba1289a5b5..6c0fb4c4dcf4 100644
--- a/src/auto_scheduler/search_policy/utils.h
+++ b/src/auto_scheduler/search_policy/utils.h
@@ -32,6 +32,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -372,7 +373,8 @@ inline bool HasSingleElementwiseMatchedConsumer(const SearchTask& task, const St
     *target_stage_id = *consumers.begin();
     if (ElementwiseMatch(task, state, stage_id, *target_stage_id) &&
         (!(HasReduceIter(state->stages[stage_id]) &&
-           HasReduceIter(state->stages[*target_stage_id])))) {
+           HasReduceIter(state->stages[*target_stage_id]))) &&
+        (!StrEndsWith(state->stages[*target_stage_id]->op->name, ".shared"))) {
       return true;
     }
   }
@@ -535,6 +537,22 @@ inline Iterator GetLastReduceIteratorInOutermostReduceTile(const Stage& stage) {
   return stage->iters[0];
 }
 
+/*! \brief Get the target stage id of a history step in the new state.
+ * We need this because the stage_id in the history may be stale due to later steps */
+inline int GetTargetStageIDInState(const State& s, int step_id) {
+  int stage_inc = 0;
+
+  for (size_t i = step_id + 1; i < s->transform_steps.size(); ++i) {
+    if (s->transform_steps[i]->IsInstance() ||
+        s->transform_steps[i]->IsInstance() ||
+        s->transform_steps[i]->IsInstance()) {
+      if (s->transform_steps[i]->stage_id <= s->transform_steps[step_id]->stage_id + stage_inc)
+        stage_inc++;
+    }
+  }
+  return s->transform_steps[step_id]->stage_id + stage_inc;
+}
+
 /*! \brief Get all split steps for one stage.
 */
inline void GetSplitStepIds(const State& s, int stage_id, std::vector* split_step_ids) {
  for (int i = static_cast(s->transform_steps.size()) - 1; i >= 0; --i) {
@@ -615,6 +633,32 @@ inline Array RandomSampleStates(const Array& in_states, std::mt199
   return out_states;
 }
 
+/*! \brief Compute prefix-sum probability based on the given weights */
+inline void ComputePrefixSumProb(const std::vector& weights,
+                                 std::vector* prefix_sum_probs) {
+  // Compute selection probabilities.
+  float sum = 0.0;
+  prefix_sum_probs->resize(weights.size());
+  for (size_t i = 0; i < weights.size(); ++i) {
+    sum += std::max(weights[i], 0.0f);
+    (*prefix_sum_probs)[i] = sum;
+  }
+  for (size_t i = 0; i < weights.size(); ++i) {
+    (*prefix_sum_probs)[i] /= sum;
+  }
+}
+
+/*! \brief Randomly choose an index according to a prefix sum probability. */
+inline int RandomChoose(const std::vector& prefix_sum_probs, std::mt19937* random_gen) {
+  std::uniform_real_distribution<> dis(0.0, 1.0);
+  double x = dis(*random_gen);
+
+  CHECK(!prefix_sum_probs.empty());
+
+  return std::lower_bound(prefix_sum_probs.begin(), prefix_sum_probs.end(), x) -
+         prefix_sum_probs.begin();
+}
+
 /*! \brief Print a title */
 inline void PrintTitle(const std::string& title, int verbose) {
   StdCout(verbose) << Chars('-', 60) << "\n"
@@ -637,6 +681,33 @@ class SplitFactorizationMemo {
  private:
  void DfsEnumerate(int now, int remaining_length, int max_innermost_factor);
 
+  /*!
+   * \brief A simple implementation of read-write lock.
+   * The guarded block can be read by multiple threads at the same time, while other operations will
+   * be blocked if one thread is writing.
+   * \note Writing threads will wait until all reading threads have finished. If there are multiple
+   * writing threads, their processing order is not guaranteed.
+   */
+  class ReadWriteLock {
+   public:
+    /*! \brief The method to get the read lock. A thread can read if there are no other
+     * writing threads. */
+    void GetRead();
+    /*! \brief The method to get the write lock. A thread can write if there are no other
+     * reading or writing threads. */
+    void GetWrite();
+    /*! \brief The method to release the read lock. */
+    void UnlockRead();
+    /*! \brief The method to release the write lock. */
+    void UnlockWrite();
+
+   private:
+    uint32_t read_count_ = 0;
+    bool is_writing_ = false;
+    std::mutex cv_mutex_;
+    std::condition_variable cv_;
+  } lock_;
+
  std::unordered_map>> memory_;
 
  int n_lengths_;
@@ -648,6 +719,10 @@ class SplitFactorizationMemo {
 /*! \brief Get the indexes of SplitStep that processes on spatial iterator. */
 Array GetSpatialSplitStepIds(const State& s, int stage_id);
 
+/*! \brief Get the possible compute locations for a stage. */
+std::vector> GetComputeLocationCandidates(const SearchTask& task,
+                                                              const State& state, int stage_id);
+
 // Apply multi-level tiling structure according to a string format,
 // where "S" stands a space level, "R" stands for a reduction level.
 // For example, if the format is "SSRSRS", then we will
@@ -662,17 +737,6 @@ State DoMultiLevelTiling(const State& state, int stage_id, const std::string& fo
 State FollowTiling(const State& state, int stage_id, const std::vector& split_step_ids,
                    int n_split);
 
-// Random choose an index according to a prefix sum probability.
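`ComputePrefixSumProb` plus `RandomChoose` implement roulette-wheel (fitness-proportionate) selection: the normalized prefix sums form a CDF over the candidates, and `std::lower_bound` on a uniform draw inverts that CDF in O(log n). A self-contained sketch combining the two helpers, assuming at least one strictly positive weight so the normalization is well defined:

```cpp
#include <algorithm>
#include <random>
#include <vector>

// Sample an index with probability proportional to max(weight, 0).
int WeightedChoice(const std::vector<float>& weights, std::mt19937* rng) {
  std::vector<double> prefix(weights.size());
  double sum = 0.0;
  for (size_t i = 0; i < weights.size(); ++i) {
    sum += std::max(weights[i], 0.0f);  // negative weights count as zero
    prefix[i] = sum;
  }
  for (double& p : prefix) p /= sum;    // normalize: last entry becomes 1.0
  std::uniform_real_distribution<> dis(0.0, 1.0);
  double x = dis(*rng);
  // First prefix entry >= x maps the draw back to a candidate index.
  return std::lower_bound(prefix.begin(), prefix.end(), x) - prefix.begin();
}
```

The evolutionary search uses this twice per iteration: once over `pop_scores` to pick a parent state and once over `rule_weights` to pick which mutation rule to apply to it.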
-inline int RandomChoose(const std::vector& prefix_sum_probs, std::mt19937* random_gen) { - std::uniform_real_distribution<> dis(0.0, 1.0); - double x = dis(*random_gen); - - CHECK(!prefix_sum_probs.empty()); - - return std::lower_bound(prefix_sum_probs.begin(), prefix_sum_probs.end(), x) - - prefix_sum_probs.begin(); -} - // Prune invalid states and return the results in-place. void PruneInvalidState(const SearchTask& task, Array* states); diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc index cec83bb93515..2a9349739752 100755 --- a/src/auto_scheduler/transform_step.cc +++ b/src/auto_scheduler/transform_step.cc @@ -780,7 +780,9 @@ Array ApplySplitToState(State* state, int stage_id, int iter_id, res = Iterator(name, Range(), it->iter_kind, IteratorAnnotation::kNone); tosplit_min = NullOpt; tosplit_extent = NullOpt; - concrete = false; + if (!l.defined()) { + concrete = false; + } } outs.push_back(std::move(res)); } diff --git a/src/node/attr_registry.h b/src/node/attr_registry.h index 7b233a865f55..01d2b68c471b 100644 --- a/src/node/attr_registry.h +++ b/src/node/attr_registry.h @@ -37,7 +37,7 @@ namespace tvm { /*! - * \breif Implementation of registry with attributes. + * \brief Implementation of registry with attributes. * * \tparam EntryType The type of the registry entry. * \tparam KeyType The actual key that is used to lookup the attributes. diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index 7c32f31fad89..d21cb1f2d9b3 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -57,7 +57,7 @@ class VarCountingSHashHandler : public SHashReducer::Handler { * the correct value. */ ObjectRef object; - /*! \biref The partially reduce hash value.*/ + /*! \brief The partially reduce hash value.*/ size_t reduced_hash; /*! \brief The expected location in the result stack. */ size_t result_stack_index = std::numeric_limits::max(); diff --git a/src/parser/diagnostic.h b/src/parser/diagnostic.h index 2eb38b312242..085d1c4ea8fb 100644 --- a/src/parser/diagnostic.h +++ b/src/parser/diagnostic.h @@ -139,7 +139,7 @@ DiagnosticBuilder Diagnostic::Help(Span span) { } /*! \brief A diagnostic context for recording errors against a source file. - * TODO(@jroesch): convert source map and improve in follow up PR, the parser + * TODO(jroesch): convert source map and improve in follow up PR, the parser * assumes a single global file for now. */ struct DiagnosticContext { @@ -160,7 +160,7 @@ struct DiagnosticContext { Render(std::cout); } - // TODO(@jroesch): eventually modularize the rendering interface to provide control of how to + // TODO(jroesch): eventually modularize the rendering interface to provide control of how to // format errors. void Render(std::ostream& ostream) { for (auto diagnostic : diagnostics) { diff --git a/src/printer/tir_hybrid_printer.cc b/src/printer/tir_hybrid_printer.cc index b58e5fc2e9a0..0fadf17c11a3 100644 --- a/src/printer/tir_hybrid_printer.cc +++ b/src/printer/tir_hybrid_printer.cc @@ -68,7 +68,7 @@ class TIRHybridPrinter : public StmtFunctor, std::unordered_set var_not_in_headers; /*! \brief buffer collector (buffer defined in BufferMap and BufferAllocation)*/ std::unordered_set buf_not_in_headers; - /*! \breif Map from Var to thread env name */ + /*! \brief Map from Var to thread env name */ std::unordered_map var_env_map_; /*! 
\brief Map from Var to Doc */ std::unordered_map memo_var_; diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 6ba1ce777f4f..2b08f45b2582 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -353,7 +353,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { } /*! * \brief Check if we can release token. - * \tok The token to be released. + * \param tok The token to be released. */ void CheckForRelease(StorageToken* tok) { CHECK_GE(tok->storage_id, 0); diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 8ab1ab1be66d..38ebe421d38d 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -972,9 +972,10 @@ bool DilateRel(const Array& types, int num_inputs, const Attrs& attrs, } // Positional relay function to create dilate operator used by frontend FFI. -Expr MakeDilate(Expr data, Array strides) { +Expr MakeDilate(Expr data, Array strides, double dilation_value = 0.0) { auto attrs = make_object(); attrs->strides = std::move(strides); + attrs->dilation_value = std::move(dilation_value); static const Op& op = Op::Get("nn.dilate"); return Call(op, {data}, Attrs(attrs), {}); } @@ -983,7 +984,7 @@ TVM_REGISTER_GLOBAL("relay.op.nn._make.dilate").set_body_typed(MakeDilate); RELAY_REGISTER_OP("nn.dilate") .describe(R"code( -Dilate data with zeros. +Dilate data with given dilation value (0 by default). )code" TVM_ADD_FILELINE) .set_num_inputs(1) .add_argument("x", "1D Tensor", "Data to dilate.") diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index c9807c39764e..441a76ef98d3 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -3126,6 +3126,8 @@ RELAY_REGISTER_OP("sparse_to_dense") .set_attr("FTVMCompute", SparseToDenseCompute); // relay.matrix_set_diag +TVM_REGISTER_NODE_TYPE(MatrixSetDiagAttrs); + bool MatrixSetDiagRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { // `types` contains: [input, diagonal, result] @@ -3137,13 +3139,28 @@ bool MatrixSetDiagRel(const Array& types, int num_inputs, const Attrs& att const auto* diagonal = types[1].as(); CHECK(diagonal); + const auto param = attrs.as(); + CHECK_GE(param->k2, param->k1); + int d_ndims = diagonal->shape.size(); - for (int i = 0; i < d_ndims - 1; i++) { + int i_ndims = input->shape.size(); + + reporter->Assert(input->shape[i_ndims - 2] > -param->k1); + reporter->Assert(input->shape[i_ndims - 1] > param->k2); + + for (int i = 0; i < d_ndims - 2; i++) { reporter->AssertEQ(input->shape[i], diagonal->shape[i]); } - auto min_dim = if_then_else(input->shape[d_ndims - 1] >= input->shape[d_ndims], - input->shape[d_ndims], input->shape[d_ndims - 1]); - reporter->Assert(diagonal->shape[d_ndims - 1] >= min_dim); + if (param->k1 != param->k2) { + reporter->AssertEQ(diagonal->shape[d_ndims - 2], param->k2 - param->k1 + 1); + } else if (d_ndims >= 2) { + reporter->AssertEQ(input->shape[d_ndims - 2], diagonal->shape[d_ndims - 2]); + } + auto max_diag_len = if_then_else(input->shape[i_ndims - 2] + (param->k2 > 0 ? param->k2 : 0) <= + input->shape[i_ndims - 1] + (param->k1 < 0 ? -param->k1 : 0), + input->shape[i_ndims - 2] + (param->k2 > 0 ? param->k2 : 0), + input->shape[i_ndims - 1] + (param->k1 < 0 ? 
-param->k1 : 0)); + reporter->AssertEQ(diagonal->shape[d_ndims - 1], max_diag_len); reporter->Assign(types[2], TensorType(input->shape, input->dtype)); return true; @@ -3151,22 +3168,37 @@ bool MatrixSetDiagRel(const Array& types, int num_inputs, const Attrs& att Array MatrixSetDiagCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { - return Array{topi::matrix_set_diag(inputs[0], inputs[1])}; -} - -Expr MakeMatrixSetDiag(Expr input, Expr diagonal) { + const auto* param = attrs.as(); + CHECK(param != nullptr); + return Array{topi::matrix_set_diag(inputs[0], inputs[1], param->k1, param->k2, + param->super_diag_right_align, + param->sub_diag_right_align)}; +} + +Expr MakeMatrixSetDiag(Expr input, Expr diagonal, int k1, int k2, bool super_diag_right_align, + bool sub_diag_right_align) { + auto attrs = make_object(); + attrs->k1 = k1; + attrs->k2 = k2; + attrs->super_diag_right_align = super_diag_right_align; + attrs->sub_diag_right_align = sub_diag_right_align; static const Op& op = Op::Get("matrix_set_diag"); - return Call(op, {input, diagonal}, Attrs(), {}); + return Call(op, {input, diagonal}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op._make.matrix_set_diag").set_body_typed(MakeMatrixSetDiag); RELAY_REGISTER_OP("matrix_set_diag") .describe( - R"code(Returns a tensor with the diagonal of input tensor replaced with the provided diagonal values. + R"code(Returns a tensor with the diagonals of input tensor replaced with the provided diagonal values. **input** Input tensor. **diagonal** Values to be filled in the diagonal. + **k1** Lower limit (included) of the range of diagonals. + **k2** Upper limit (included) of the range of diagonals. + **super_diag_right_align** Bool, true iff super-diagonal is right aligned (left-padded). + **sub_diag_right_align** Bool, true iff sub-diagonal is right aligned (left-padded). )code" TVM_ADD_FILELINE) + .set_attrs_type() .set_num_inputs(2) .add_argument("input", "Tensor", "Input Tensor.") .add_argument("diagonal", "Tensor", "Values to be filled in the diagonal.") diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index c307d75a9aba..74c236ae3280 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -35,9 +35,10 @@ namespace tvm { namespace relay { namespace annotate_target { -const PackedFunc* make_begin_op = +static const PackedFunc* make_begin_op = runtime::Registry::Get("relay.op.annotation._make.compiler_begin"); -const PackedFunc* make_end_op = runtime::Registry::Get("relay.op.annotation._make.compiler_end"); +static const PackedFunc* make_end_op = + runtime::Registry::Get("relay.op.annotation._make.compiler_end"); // A helper class to insert annotation boundaries for a program region that will // be handled by a specific compiler. 
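On the `static const PackedFunc*` change just above (and the matching one in partition_graph.cc below): a namespace-scope pointer is not itself const even when it points to const, so by default it has external linkage and is exported from the translation unit; `static` gives it internal linkage, keeping each file's cached registry lookup private and ruling out clashes between identically named definitions. A miniature of the linkage rule, with illustrative names and two hypothetical source files sketched in one block:

```cpp
// helper_a.cc
const char* cached_op_name = "compiler_end";  // pointer-to-const, but the pointer
                                              // itself is mutable: external linkage,
                                              // so a second non-static definition of
                                              // this name elsewhere is an ODR violation

// helper_b.cc
static const char* cached_op_name = "x";      // internal linkage: private to this
                                              // file, cannot collide with helper_a.cc

// Contrast: a genuinely const object is internal by default in C++, e.g.
// `const char* const cached_op_name = ...;` would not need `static`.
```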
diff --git a/src/relay/transforms/convert_layout.cc b/src/relay/transforms/convert_layout.cc index 9a71642aac13..65fdeda5f6cd 100644 --- a/src/relay/transforms/convert_layout.cc +++ b/src/relay/transforms/convert_layout.cc @@ -88,22 +88,23 @@ class ConvertTransformMemorizer : public TransformMemorizer { Expr new_e; bool modified = false; if (fconvert_layout.count(op)) { - tvm::Array tinfos; - for (auto expr : ref_call->args) { - auto ttype = expr->type_as(); - tinfos.push_back(tvm::te::placeholder(ttype->shape, ttype->dtype)); - } - auto desired_layouts = operator->()->desired_layouts_; - if (desired_layouts.find(op->name) == desired_layouts.end()) { - LOG(FATAL) << "Desired layout(s) not specified for op: " << op->name; - } - Array op_desired_layouts = desired_layouts.at(op->name); - Expr altered_value = - fconvert_layout[op](ref_call->attrs, new_args, tinfos, op_desired_layouts); - if (altered_value.defined()) { - new_e = altered_value; - modified = true; + if (desired_layouts.find(op->name) != desired_layouts.end()) { + tvm::Array tinfos; + for (auto expr : ref_call->args) { + auto ttype = expr->type_as(); + tinfos.push_back(tvm::te::placeholder(ttype->shape, ttype->dtype)); + } + + Array op_desired_layouts = desired_layouts.at(op->name); + Expr altered_value = + fconvert_layout[op](ref_call->attrs, new_args, tinfos, op_desired_layouts); + if (altered_value.defined()) { + new_e = altered_value; + modified = true; + } + } else { + LOG(WARNING) << "Desired layout(s) not specified for op: " << op->name; } } if (!modified) { diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/gradient.cc index 9c472542cc91..bf8105080317 100644 --- a/src/relay/transforms/gradient.cc +++ b/src/relay/transforms/gradient.cc @@ -338,7 +338,7 @@ Expr FirstOrderGradient(const Expr& re, const Optional& mod) { TVM_REGISTER_GLOBAL("relay._transform.first_order_gradient").set_body_typed(FirstOrderGradient); -Type bpt = RelayRefType(FuncType({}, TupleType(Array()), {}, {})); +static Type bpt = RelayRefType(FuncType({}, TupleType(Array()), {}, {})); struct ReverseADType : TypeMutator { Type VisitType_(const TensorTypeNode* ttn) final { diff --git a/src/relay/transforms/infer_layout_util.h b/src/relay/transforms/infer_layout_util.h index 9868ee5d03db..5cc180e8e2e3 100644 --- a/src/relay/transforms/infer_layout_util.h +++ b/src/relay/transforms/infer_layout_util.h @@ -142,7 +142,7 @@ inline Array> BinaryBroadcastLayout(const Attrs& attrs, if (!layouts[0].defined() && !layouts[1].defined()) { // both undefined, infer fails - return Array>{{Layout::Undef()}, {Layout::Undef()}}; + return Array>{{Layout::Undef(), Layout::Undef()}, {Layout::Undef()}}; } else if (!layouts[0].defined() || !layouts[1].defined()) { // only one is defined, use shape information to help infer int defined_idx = layouts[0].defined() ? 0 : 1; @@ -157,7 +157,7 @@ inline Array> BinaryBroadcastLayout(const Attrs& attrs, // only know the tensor with smaller dimensions, // so we cannot infer the final broadcasted output. // fails in this case. 
- return Array>{{Layout::Undef()}, {Layout::Undef()}}; + return Array>{{Layout::Undef(), Layout::Undef()}, {Layout::Undef()}}; } } else if (layouts[0].defined() && layouts[1].defined() && (layouts[0].ndim() == 0 || layouts[1].ndim() == 0)) { diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index d9555570d9fa..a82dc0a0697c 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -428,7 +428,8 @@ IRModule RemoveDefaultAnnotations(IRModule module) { */ // New annotations would be required to be added for each flattened output -const PackedFunc* make_end_op = runtime::Registry::Get("relay.op.annotation._make.compiler_end"); +static const PackedFunc* make_end_op = + runtime::Registry::Get("relay.op.annotation._make.compiler_end"); IRModule FlattenTupleOutputs(IRModule module) { class TupleOutFlattener : public ExprRewriter { diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 1c860b8ffb3f..9895ff6987ad 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -148,7 +148,7 @@ void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, pt TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) { LOG(FATAL) << "Device does not support stream api."; - return 0; + return nullptr; } void DeviceAPI::FreeStream(TVMContext ctx, TVMStreamHandle stream) { @@ -462,7 +462,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked API_BEGIN(); if (fin == nullptr) { *out = new PackedFunc([func, resource_handle](TVMArgs args, TVMRetValue* rv) { - int ret = func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) + int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, resource_handle); if (ret != 0) { throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); @@ -473,7 +473,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked // so fin will be called when the lambda went out of scope. std::shared_ptr rpack(resource_handle, fin); *out = new PackedFunc([func, rpack](TVMArgs args, TVMRetValue* rv) { - int ret = func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) + int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, rpack.get()); if (ret != 0) { throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index f2d2fca64055..3fb4e0a3575f 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -31,6 +31,7 @@ #ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB #include #include +#include #include #include #include @@ -139,12 +140,13 @@ class ACLRuntime : public JSONRuntimeBase { CreateGlobalPoolingLayer(&layer_, node); } else if ("reshape" == op_name) { CreateReshapeLayer(&layer_, node); + } else if ("maximum" == op_name) { + CreateMaximumLayer(&layer_, node); } else { LOG(FATAL) << "Unsupported op: " << op_name; } } } - this->layer_.function->prepare(); if (num_pools > 0) mm->populate(this->allocator_, num_pools); } @@ -401,6 +403,21 @@ class ACLRuntime : public JSONRuntimeBase { layer->function = function; } + /*! + * \brief Create a maximum layer. + * + * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function. + * \param node The JSON representation of the operator. 
+ */ + void CreateMaximumLayer(CachedLayer* layer, const JSONGraphNode& node) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1])); + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); + auto function = std::make_shared(); + function->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0]); + layer->function = function; + } + /*! \brief Allow ACL functions to request auxiliary memory from TVM. */ ACLAllocator allocator_; /*! diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index b69ecf26808e..f7b88ccdd964 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -217,7 +217,7 @@ class CUDADeviceAPI final : public DeviceAPI { private: static void GPUCopy(const void* from, void* to, size_t size, cudaMemcpyKind kind, cudaStream_t stream) { - if (stream != 0) { + if (stream != nullptr) { CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream)); } else { CUDA_CALL(cudaMemcpy(to, from, size, kind)); diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index 498a9b703a7b..bf844c1ad798 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -169,9 +169,9 @@ class CUDAWrappedFunc { } CUstream strm = static_cast(CUDAThreadEntry::ThreadLocal()->stream); ThreadWorkLoad wl = thread_axis_cfg_.Extract(args); - CUresult result = - cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2), - wl.block_dim(0), wl.block_dim(1), wl.block_dim(2), 0, strm, void_args, 0); + CUresult result = cuLaunchKernel(fcache_[device_id], wl.grid_dim(0), wl.grid_dim(1), + wl.grid_dim(2), wl.block_dim(0), wl.block_dim(1), + wl.block_dim(2), 0, strm, void_args, nullptr); if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { const char* msg; cuGetErrorName(result, &msg); diff --git a/src/runtime/hexagon/sim/driver/CMakeLists.txt b/src/runtime/hexagon/sim/driver/CMakeLists.txt index 8632b491f259..a98cfe07fcdc 100644 --- a/src/runtime/hexagon/sim/driver/CMakeLists.txt +++ b/src/runtime/hexagon/sim/driver/CMakeLists.txt @@ -56,6 +56,8 @@ target_include_directories(sim_dev PUBLIC "." PUBLIC ".." 
PUBLIC "../../../../../include" +) +target_include_directories(sim_dev SYSTEM PUBLIC "../../../../../3rdparty/dlpack/include" ) diff --git a/src/runtime/hexagon/target/fastrpc/CMakeLists.txt b/src/runtime/hexagon/target/fastrpc/CMakeLists.txt index 072b9ca62fb2..0d790d76f7f7 100644 --- a/src/runtime/hexagon/target/fastrpc/CMakeLists.txt +++ b/src/runtime/hexagon/target/fastrpc/CMakeLists.txt @@ -27,10 +27,10 @@ endif() set(FASTRPC_SRC "${CMAKE_CURRENT_SOURCE_DIR}") include_directories(include) -include_directories(${HEXAGON_SDK_ROOT}/incs) -include_directories(${HEXAGON_SDK_ROOT}/incs/stddef) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef) include_directories( - ${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64) + SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64) set(QAIC_EXE "${HEXAGON_SDK_ROOT}/tools/qaic/Ubuntu16/qaic") set(QAIC_FLAGS @@ -96,9 +96,9 @@ if("${FASTRPC_LIBS}" STREQUAL "SKEL") endif() include_directories( - ${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/qurt) + SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/qurt) include_directories( - ${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/posix) + SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/qurt/${HEXARCH_DIR}/include/posix) # Extra compile flags (both C and C++). set(EXTRA_COMP_FLAGS @@ -158,11 +158,11 @@ if("${FASTRPC_LIBS}" STREQUAL "SKEL") else() # Stub libraries. # - include_directories(${HEXAGON_SDK_ROOT}/incs/a1std) - include_directories(${HEXAGON_SDK_ROOT}/incs/qlist) - include_directories(${HEXAGON_SDK_ROOT}/libs/common/rpcmem/inc) + include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/a1std) + include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/qlist) + include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/rpcmem/inc) link_directories( - ${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64) + SYSTEM ${HEXAGON_SDK_ROOT}/libs/common/remote/ship/android_Release_aarch64) add_library(tvm_remote_nd_stub SHARED "${FASTRPC_SRC}/include/${TVM_REMOTE_ND_H}" diff --git a/src/runtime/library_module.cc b/src/runtime/library_module.cc index 651e19cdbd37..a5935491fcd7 100644 --- a/src/runtime/library_module.cc +++ b/src/runtime/library_module.cc @@ -74,7 +74,7 @@ PackedFunc WrapPackedFunc(TVMBackendPackedCFunc faddr, const ObjectPtr& TVMValue ret_value; int ret_type_code = kTVMNullptr; int ret = (*faddr)(const_cast(args.values), const_cast(args.type_codes), - args.num_args, &ret_value, &ret_type_code, NULL); + args.num_args, &ret_value, &ret_type_code, nullptr); CHECK_EQ(ret, 0) << TVMGetLastError(); if (ret_type_code != kTVMNullptr) { *rv = TVMRetValue::MoveFromCHost(ret_value, ret_type_code); diff --git a/src/support/socket.h b/src/support/socket.h index 3ccfaaab5ab5..d70f956a51fb 100644 --- a/src/support/socket.h +++ b/src/support/socket.h @@ -131,9 +131,9 @@ struct SockAddr { hints.ai_family = PF_UNSPEC; hints.ai_flags = AI_PASSIVE; hints.ai_socktype = SOCK_STREAM; - addrinfo* res = NULL; - int sig = getaddrinfo(host, NULL, &hints, &res); - CHECK(sig == 0 && res != NULL) << "cannot obtain address of " << host; + addrinfo* res = nullptr; + int sig = getaddrinfo(host, nullptr, &hints, &res); + CHECK(sig == 0 && res != nullptr) << "cannot obtain address of " << host; switch (res->ai_family) { case AF_INET: { sockaddr_in* addr4 = reinterpret_cast(&addr); @@ -403,7 +403,7 @@ class TCPSocket : public Socket { * \return The accepted socket connection. 
*/ TCPSocket Accept() { - SockType newfd = accept(sockfd, NULL, NULL); + SockType newfd = accept(sockfd, nullptr, nullptr); if (newfd == INVALID_SOCKET) { Socket::Error("Accept"); } diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 205a8a44c1ea..1f6eedde0b21 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -106,7 +106,7 @@ class CodeGenAMDGPU : public CodeGenLLVM { llvm::Type* type = llvm::ArrayType::get(DTypeToLLVMType(op->dtype), constant_size); // Allocate shared memory in global, address_space = 3 llvm::GlobalVariable* global = new llvm::GlobalVariable( - *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared", nullptr, + *module_, type, false, llvm::GlobalValue::PrivateLinkage, nullptr, ".shared", nullptr, llvm::GlobalValue::NotThreadLocal, shared_address_space); if (global->getAlignment() < static_cast<uint32_t>(info.alignment)) { #if TVM_LLVM_VERSION >= 100 diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index 127889d52fbd..53104542417e 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -225,8 +225,9 @@ void CodeGenCPU::AddMainFunction(const std::string& entry_func_name) { llvm::Function* f = module_->getFunction(entry_func_name); CHECK(f) << "Function " << entry_func_name << "does not in module"; llvm::Type* type = llvm::ArrayType::get(t_char_, entry_func_name.length() + 1); - llvm::GlobalVariable* global = new llvm::GlobalVariable( - *module_, type, true, llvm::GlobalValue::WeakAnyLinkage, 0, runtime::symbol::tvm_module_main); + llvm::GlobalVariable* global = + new llvm::GlobalVariable(*module_, type, true, llvm::GlobalValue::WeakAnyLinkage, nullptr, + runtime::symbol::tvm_module_main); #if TVM_LLVM_VERSION >= 100 global->setAlignment(llvm::Align(1)); #else @@ -349,7 +350,7 @@ llvm::Value* CodeGenCPU::CreateCallExtern(Type ret_type, String global_symbol, llvm::GlobalVariable* CodeGenCPU::InitContextPtr(llvm::Type* p_type, std::string name) { llvm::GlobalVariable* gv = new llvm::GlobalVariable( - *module_, p_type, false, llvm::GlobalValue::LinkOnceAnyLinkage, 0, name); + *module_, p_type, false, llvm::GlobalValue::LinkOnceAnyLinkage, nullptr, name); #if TVM_LLVM_VERSION >= 100 gv->setAlignment(llvm::Align(data_layout_->getTypeAllocSize(p_type))); #else @@ -552,8 +553,9 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task) { } llvm::Value* CodeGenCPU::CreateStaticHandle() { - llvm::GlobalVariable* gv = new llvm::GlobalVariable( - *module_, t_void_p_, false, llvm::GlobalValue::PrivateLinkage, 0, "__tvm_static_handle"); + llvm::GlobalVariable* gv = + new llvm::GlobalVariable(*module_, t_void_p_, false, llvm::GlobalValue::PrivateLinkage, + nullptr, "__tvm_static_handle"); #if TVM_LLVM_VERSION >= 100 gv->setAlignment(llvm::Align(data_layout_->getTypeAllocSize(t_void_p_))); #else diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index c52f9b06929e..a7e96c95e07f 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -237,7 +237,7 @@ llvm::Value* CodeGenHexagon::CreateCallExtern(Type ret_type, String global_symbo llvm::GlobalVariable* CodeGenHexagon::InitContextPtr(llvm::Type* p_type, std::string name) { llvm::GlobalVariable* gv = new llvm::GlobalVariable( - *module_, p_type, false, llvm::GlobalValue::LinkOnceAnyLinkage, 0, name); + *module_, p_type, false, llvm::GlobalValue::LinkOnceAnyLinkage, nullptr, name); #if TVM_LLVM_VERSION >= 100
gv->setAlignment(llvm::Align(data_layout_->getTypeAllocSize(p_type))); #else diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 4ffd6b25c237..cb04e6b8055b 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -628,8 +628,8 @@ llvm::Constant* CodeGenLLVM::GetConstString(const std::string& str) { auto it = str_map_.find(str); if (it != str_map_.end()) return it->second; llvm::Type* type = llvm::ArrayType::get(t_char_, str.length() + 1); - llvm::GlobalVariable* global = - new llvm::GlobalVariable(*module_, type, true, llvm::GlobalValue::PrivateLinkage, 0, ".str"); + llvm::GlobalVariable* global = new llvm::GlobalVariable( + *module_, type, true, llvm::GlobalValue::PrivateLinkage, nullptr, ".str"); #if TVM_LLVM_VERSION >= 100 global->setAlignment(llvm::Align(1)); #else diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc index e9999f1ca283..601df86d10ba 100644 --- a/src/target/llvm/codegen_nvptx.cc +++ b/src/target/llvm/codegen_nvptx.cc @@ -82,7 +82,7 @@ class CodeGenNVPTX : public CodeGenLLVM { llvm::Type* type = llvm::ArrayType::get(DTypeToLLVMType(op->dtype), constant_size); // Allocate shared memory in global, address_space = 3 llvm::GlobalVariable* global = new llvm::GlobalVariable( - *module_, type, false, llvm::GlobalValue::PrivateLinkage, 0, ".shared", nullptr, + *module_, type, false, llvm::GlobalValue::PrivateLinkage, nullptr, ".shared", nullptr, llvm::GlobalValue::NotThreadLocal, shared_address_space); #if TVM_LLVM_VERSION >= 100 global->setAlignment(llvm::Align(info.alignment)); diff --git a/src/tir/transforms/lower_custom_datatypes.cc b/src/tir/transforms/lower_custom_datatypes.cc index 7fd2352a3d19..ae9584f488af 100644 --- a/src/tir/transforms/lower_custom_datatypes.cc +++ b/src/tir/transforms/lower_custom_datatypes.cc @@ -97,7 +97,7 @@ class CustomDatatypesLowerer : public StmtExprMutator { return expr; } -#define DEFINE_MUTATE__(OP, NodeName) \ +#define DEFINE_MUTATE(OP, NodeName) \ inline PrimExpr VisitExpr_(const NodeName* op) final { \ auto type_code = op->dtype.code(); \ bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code); \ @@ -112,19 +112,19 @@ class CustomDatatypesLowerer : public StmtExprMutator { return expr; \ } - DEFINE_MUTATE__(Add, AddNode); - DEFINE_MUTATE__(Sub, SubNode); - DEFINE_MUTATE__(Mul, MulNode); - DEFINE_MUTATE__(Div, DivNode); - DEFINE_MUTATE__(Mod, ModNode); - DEFINE_MUTATE__(Min, MinNode); - DEFINE_MUTATE__(Max, MaxNode); - DEFINE_MUTATE__(EQ, EQNode); - DEFINE_MUTATE__(NE, NENode); - DEFINE_MUTATE__(LT, LTNode); - DEFINE_MUTATE__(LE, LENode); - DEFINE_MUTATE__(GT, GTNode); - DEFINE_MUTATE__(GE, GENode); + DEFINE_MUTATE(Add, AddNode); + DEFINE_MUTATE(Sub, SubNode); + DEFINE_MUTATE(Mul, MulNode); + DEFINE_MUTATE(Div, DivNode); + DEFINE_MUTATE(Mod, ModNode); + DEFINE_MUTATE(Min, MinNode); + DEFINE_MUTATE(Max, MaxNode); + DEFINE_MUTATE(EQ, EQNode); + DEFINE_MUTATE(NE, NENode); + DEFINE_MUTATE(LT, LTNode); + DEFINE_MUTATE(LE, LENode); + DEFINE_MUTATE(GT, GTNode); + DEFINE_MUTATE(GE, GENode); // Later changes may need to add more mutate functions as we support workloads with more ops. 
private: diff --git a/src/topi/nn.cc b/src/topi/nn.cc index 4a209b2f2932..c03d1b056d35 100644 --- a/src/topi/nn.cc +++ b/src/topi/nn.cc @@ -75,7 +75,7 @@ TVM_REGISTER_GLOBAL("topi.nn.batch_matmul").set_body([](TVMArgs args, TVMRetValu /* Ops from nn/dilate.h */ TVM_REGISTER_GLOBAL("topi.nn.dilate").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = nn::dilate(args[0], args[1]); + *rv = nn::dilate(args[0], args[1], args[2]); }); /* Ops from nn/flatten.h */ diff --git a/src/topi/schedule.cc b/src/topi/schedule.cc index ead803b84cf9..83457ced9f16 100644 --- a/src/topi/schedule.cc +++ b/src/topi/schedule.cc @@ -21,7 +21,6 @@ * \brief Registration of TVM schedules * \file schedule.cc */ -#define TOPI_REDUCE_ATLEAST1D 0 #include #include diff --git a/src/topi/transform.cc b/src/topi/transform.cc index bf7e1e67c247..d79952e2494f 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -177,7 +177,11 @@ TVM_REGISTER_GLOBAL("topi.one_hot").set_body([](TVMArgs args, TVMRetValue* rv) { }); TVM_REGISTER_GLOBAL("topi.matrix_set_diag").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = matrix_set_diag(args[0], args[1]); + int k1 = args[2]; + int k2 = args[3]; + bool super_diag_right_align = args[4]; + bool sub_diag_right_align = args[5]; + *rv = matrix_set_diag(args[0], args[1], k1, k2, super_diag_right_align, sub_diag_right_align); }); TVM_REGISTER_GLOBAL("topi.adv_index").set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/tests/python/contrib/test_arm_compute_lib/test_maximum.py b/tests/python/contrib/test_arm_compute_lib/test_maximum.py new file mode 100644 index 000000000000..8ddb901946fc --- /dev/null +++ b/tests/python/contrib/test_arm_compute_lib/test_maximum.py @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Arm Compute Library integration reshape tests.""" + +import numpy as np + +import tvm +from tvm import relay + +from .infrastructure import ( + skip_runtime_test, + skip_codegen_test, + build_and_run, + verify, + verify_codegen, +) +from .infrastructure import Device + + +def _get_model(input_shape, dtype, var_names): + """Return a model and any parameters it may have.""" + a = relay.var(next(var_names), shape=input_shape, dtype=dtype) + b = relay.var(next(var_names), shape=input_shape, dtype=dtype) + max = relay.maximum(a, b) + return max + + +def _get_expected_codegen(shape, dtype): + node = { + "op": "kernel", + "name": "maximum", + "inputs": [[0, 0, 0], [1, 0, 0]], + "attrs": { + "num_inputs": "2", + "num_outputs": "1", + "shape": [[list(shape)]], + "dtype": [[dtype]], + }, + } + + inputs = [ + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}, + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}, + ] + inputs.append(node) + return inputs + + +def test_maximum(): + Device.load("test_config.json") + + if skip_runtime_test(): + return + + device = Device() + np.random.seed(0) + + for dtype, low, high, atol, rtol in [ + ("float32", -127, 128, 0.001, 0.001), + ("float32", -1, 1, 0.001, 0.001), + ]: + inputs = { + "a": tvm.nd.array(np.random.uniform(low, high, (100, 100)).astype(dtype)), + "b": tvm.nd.array(np.random.uniform(low, high, (100, 100)).astype(dtype)), + } + outputs = [] + func = _get_model(inputs["a"].shape, dtype, iter(inputs)) + + for acl in [False, True]: + outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0]) + + verify(outputs, atol=1e-7, rtol=1e-7) + + +def test_codegen_maximum(): + if skip_codegen_test(): + return + + shape = (100, 100) + inputs = {"a", "b"} + for dtype in ["float32"]: + args = (shape, dtype) + func = _get_model(*args, iter(inputs)) + exp_codegen = _get_expected_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_maximum() + test_codegen_maximum() diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index bda0f472148b..1aeb430de52f 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -1969,6 +1969,7 @@ def verify_prelu(x_shape, a_shape): verify_prelu([3, 4, 5, 6], [1, 4, 1, 1]) verify_prelu([1, 8, 5, 6], [1, 8, 1, 1]) verify_prelu([2, 12, 16, 16], [1, 12, 1, 1]) + verify_prelu([2, 12, 16, 16], [1]) # Test alpha broadcasting. 
@tvm.testing.uses_gpu diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 83ba22b7c1d9..8c1143646426 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -17,6 +17,7 @@ # pylint: disable=import-self, invalid-name, unused-argument """Unit tests for various models and operators""" from time import time +import os import sys from scipy.stats import t as tdistr import numpy as np @@ -1519,6 +1520,10 @@ class ToDouble(Module): def forward(self, x): return x.double() + class ToFloat16(Module): + def forward(self, x): + return x.to(torch.float16) + verify_model(ToCPU().eval(), torch.rand((1, 3, 32, 32))) verify_model(ToFloat().eval(), torch.zeros((1, 3, 32, 32), dtype=torch.int)) verify_model(ToFloat().eval(), torch.tensor(2, dtype=torch.int)) @@ -1526,6 +1531,8 @@ def forward(self, x): verify_model(ToInt().eval(), torch.tensor(0.8)) verify_model(ToLong().eval(), torch.tensor(0.8)) verify_model(ToDouble().eval(), torch.tensor(0.8)) + verify_model(ToFloat16().eval(), torch.tensor(2, dtype=torch.float32)) + verify_model(ToFloat16().eval(), torch.zeros((1, 3, 32, 32), dtype=torch.int)) @tvm.testing.uses_gpu @@ -3286,6 +3293,43 @@ def test_forward_pretrained_bert_base_uncased(): print("TVM top-1 id: {}, token: {}".format(tvm_pred_idx, tvm_pred_token)) +def test_convert_torch_script_with_input_types(): + def model_fn(x, y): + x = x.to(dtype=torch.int32) + y = x + y + return y + + ishape = (4, 5) + input_x = torch.rand(ishape, dtype=torch.float32) + input_y = torch.randint(low=0, high=100, size=ishape, dtype=torch.int32) + inputs = [input_x, input_y] + script_module = torch.jit.trace(model_fn, inputs) + + fname = "tmp.pt" + torch.jit.save(script_module, fname) + loaded = torch.jit.load(fname) + os.remove(fname) + + verify_model(loaded.eval(), input_data=inputs) + + def expected(x_shape, y_shape): + # use a fixed order of args so alpha equal check can pass + x = relay.var("x", shape=x_shape, dtype="float32") + y = relay.var("y", shape=y_shape, dtype="int32") + args = [x, y] + x1 = relay.cast(x, "int32") + y1 = relay.add(x1, y) + mod = tvm.IRModule.from_expr(relay.Function(args, y1)) + return mod["main"] + + input_infos = [("input0", (ishape, "float")), ("input1", (ishape, "int"))] + mod, params = relay.frontend.from_pytorch(loaded, input_infos) + + expected_mod = expected(ishape, ishape) + + assert tvm.ir.structural_equal(expected_mod, mod["main"], map_free_vars=True) + + if __name__ == "__main__": # some structural tests test_forward_traced_function() @@ -3446,3 +3490,6 @@ def test_forward_pretrained_bert_base_uncased(): # Test bert model test_forward_pretrained_bert_base_uncased() + + # Test convert torch script(jit) with specific inputs' types + test_convert_torch_script_with_input_types() diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index d24a733f7655..c13f679e6108 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -740,18 +740,24 @@ def test_any_pad(): verify_any_pad(any_dims(4), ((1, 0), (1, 3), (0, 2), (9, 0)), (13, 11, 3, 1)) -def verify_any_dilate(data_shape, strides, static_data_shape): +def verify_any_dilate(data_shape, strides, static_data_shape, dilation_value=None): assert len(data_shape) == len(strides) mod = tvm.IRModule() dtype = "float32" data = relay.var("data", shape=data_shape, dtype=dtype) - y = relay.nn.dilate(data, strides) + if dilation_value is None: + y = relay.nn.dilate(data, strides) + else: 
+ y = relay.nn.dilate(data, strides, dilation_value) mod["main"] = relay.Function([data], y) data_np = np.random.uniform(size=static_data_shape).astype(dtype) ref_shape = tuple( (static_data_shape[i] - 1) * strides[i] + 1 for i in range(len(static_data_shape)) ) - ref_out = np.zeros(shape=ref_shape, dtype=dtype) + if dilation_value is None: + dilation_value = 0.0 + ref_out = np.ones(shape=ref_shape, dtype=dtype) + ref_out = dilation_value * ref_out ref_out[tuple(slice(None, None, strides[i]) for i in range(len(data_shape)))] = data_np check_result([data_np], mod, ref_out) @@ -766,6 +772,7 @@ def test_any_dilate(): verify_any_dilate(any_dims(3), (1, 1, 5), (1, 2, 3)) verify_any_dilate(any_dims(3), (3, 7, 5), (1, 2, 3)) verify_any_dilate(any_dims(4), (3, 7, 1, 5), (1, 2, 3, 4)) + verify_any_dilate(any_dims(4), (3, 7, 1, 5), (1, 2, 3, 4), 1.0) def verify_any_softmax(data_shape, axis, static_data_shape, ref_out_shape): diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 8ad40a617b34..bc565682d932 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -533,12 +533,10 @@ def _verify(indices_shape, depth, on_value, off_value, axis, dtype): @tvm.testing.uses_gpu def test_matrix_set_diag(): - def _verify(input_shape, dtype): - diagonal_shape = list(input_shape[:-2]) - diagonal_shape.append(min(input_shape[-2], input_shape[-1])) + def _verify(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"): input = relay.var("input", relay.TensorType(input_shape, dtype)) diagonal = relay.var("diagonal", relay.TensorType(diagonal_shape, dtype)) - out = relay.matrix_set_diag(input, diagonal) + out = relay.matrix_set_diag(input, diagonal, k, align) in_type = run_infer_type(input) out_type = run_infer_type(out) @@ -547,7 +545,7 @@ def _verify(input_shape, dtype): func = relay.Function([input, diagonal], out) input_np = np.random.randint(-100, 100, size=input_shape).astype(dtype) diagonal_np = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype) - out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np) + out_np = tvm.topi.testing.matrix_set_diag(input_np, diagonal_np, k, align) for target, ctx in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: @@ -555,9 +553,12 @@ def _verify(input_shape, dtype): out_relay = intrp.evaluate(func)(input_np, diagonal_np) tvm.testing.assert_allclose(out_relay.asnumpy(), out_np) - _verify((2, 2), "float32") - _verify((4, 3, 3), "int32") - _verify((2, 3, 4), "float32") + _verify((2, 2), (2,), "float32") + _verify((4, 3, 3), (4, 3), "int32") + _verify((2, 3, 4), (2, 3), "float32", 1) + _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "LEFT_RIGHT") + _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "LEFT_LEFT") + _verify((2, 3, 4), (2, 4, 3), "int32", (-1, 2), "RIGHT_RIGHT") if __name__ == "__main__": diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 9954c0143a68..c765de48d7f8 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -52,6 +52,33 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) +def test_qnn_binary_no_convert_layout(): + def before(): + x = relay.var("x", shape=(2, 2)) + y = relay.var("y", shape=(1, 2)) + return relay.Function( + [x, y], + relay.qnn.op.add( + x, + y, + lhs_scale=relay.const(0.0156863, "float32"), + lhs_zero_point=relay.const(127, "int32"), + 
rhs_scale=relay.const(0.0117647, "float32"), + rhs_zero_point=relay.const(85, "int32"), + output_scale=relay.const(0.0235294, "float32"), + output_zero_point=relay.const(128, "int32"), + ), + ) + + def expected(): + return before() + + a = before() + a = run_opt_pass(a, transform.ConvertLayout({})) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + + def test_conv_convert_layout(): def before(): x = relay.var("x", shape=(1, 56, 56, 64)) @@ -1041,7 +1068,57 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) +def test_no_desired_layout(): + def before(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(64, 64, 3, 3)) + y = relay.nn.conv2d( + x, + weight1, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + rois = relay.var("rois", shape=(32, 5)) + y = relay.vision.roi_align( + y, rois, pooled_size=(14, 14), spatial_scale=0.0625, sample_ratio=2, layout="NCHW" + ) + y = relay.Function(analysis.free_vars(y), y) + return y + + def expected(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(64, 64, 3, 3)) + x = relay.layout_transform(x, "NCHW", "NHWC") + weight1 = relay.layout_transform(weight1, "OIHW", "HWIO") + y = relay.nn.conv2d( + x, + weight1, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y = relay.layout_transform(y, "NHWC", "NCHW") + rois = relay.var("rois", shape=(32, 5)) + y = relay.vision.roi_align( + y, rois, pooled_size=(14, 14), spatial_scale=0.0625, sample_ratio=2, layout="NCHW" + ) + y = relay.Function(analysis.free_vars(y), y) + return y + + a = before() + a = run_opt_pass(a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "HWIO"]})) + b = run_opt_pass(expected(), transform.InferType()) + + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + + if __name__ == "__main__": + test_qnn_binary_no_convert_layout() test_no_convert_layout() test_conv_convert_layout() test_conv_nhwc_convert_layout() @@ -1061,3 +1138,4 @@ def expected(): test_conv_roi_align_convert_layout() test_default_keyword() test_different_ops_convert_layout() + test_no_desired_layout() diff --git a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py index 742892d19fa9..267cfbe4c990 100644 --- a/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_transpose_nchw.py @@ -62,23 +62,18 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() - def check_device(device, ctx): - print("Running on target: %s" % device) - with tvm.target.Target(device): - fcompute, fschedule = tvm.topi.testing.dispatch( - device, _conv2d_transpose_nchw_implement - ) - B = fcompute( - A, - W, - [stride_height, stride_width], - [pad_top, pad_left, pad_bottom, pad_right], - A.dtype, - output_padding, - ) - C = topi.nn.relu(B) - s1 = fschedule([B]) - s2 = fschedule([C]) + def check(fcompute, fschedule, device, ctx): + B = fcompute( + A, + W, + [stride_height, stride_width], + [pad_top, pad_left, pad_bottom, pad_right], + A.dtype, + output_padding, + ) + C = topi.nn.relu(B) + s1 = fschedule([B]) + s2 = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) @@ -91,6 +86,22 @@ def check_device(device, ctx): tvm.testing.assert_allclose(b.asnumpy(), 
b_np, rtol=1e-5) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + def check_generic(device, ctx): + print("Running generic on target: %s" % device) + with tvm.target.Target(device): + fcompute, fschedule = _conv2d_transpose_nchw_implement["generic"] + check(fcompute, fschedule, device, ctx) + + check_generic("llvm", tvm.cpu(0)) + + def check_device(device, ctx): + print("Running on target: %s" % device) + with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch( + device, _conv2d_transpose_nchw_implement + ) + check(fcompute, fschedule, device, ctx) + for device, ctx in tvm.testing.enabled_targets(): check_device(device, ctx) diff --git a/tests/python/topi/python/test_topi_dilate.py b/tests/python/topi/python/test_topi_dilate.py index ab5c61dce406..0ee51a6c7bf4 100644 --- a/tests/python/topi/python/test_topi_dilate.py +++ b/tests/python/topi/python/test_topi_dilate.py @@ -26,12 +26,18 @@ def test_dilate(): target = "llvm" ctx = tvm.cpu(0) - def _test_dilate(input_size, strides): + def _test_dilate(input_size, strides, dilation_value=None): Input = te.placeholder((input_size)) - Output = topi.nn.dilate(Input, strides) + if dilation_value is None: + Output = topi.nn.dilate(Input, strides) + else: + Output = topi.nn.dilate(Input, strides, dilation_value) schedule = te.create_schedule(Output.op) input_np = np.random.uniform(size=input_size).astype(Input.dtype) - output_np = tvm.topi.testing.dilate_python(input_np, strides) + if dilation_value is None: + output_np = tvm.topi.testing.dilate_python(input_np, strides) + else: + output_np = tvm.topi.testing.dilate_python(input_np, strides, dilation_value) input_tvm = tvm.nd.array(input_np, ctx=ctx) output_size = topi.util.get_const_tuple(Output.shape) output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx) @@ -47,6 +53,7 @@ def _test_dilate(input_size, strides): _test_dilate((1, 32, 32, 3, 3), (2, 2, 2, 2, 2)) _test_dilate((1, 32, 32, 32, 3, 3), (1, 1, 1, 2, 2, 2)) _test_dilate((1, 32, 32, 32, 3, 3), (2, 2, 2, 1, 1, 1)) + _test_dilate((1, 32, 32, 32, 3, 3), (2, 2, 2, 1, 1, 1), 1.0) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_group_conv2d.py b/tests/python/topi/python/test_topi_group_conv2d.py index d97716bbb3db..45e77d19082d 100644 --- a/tests/python/topi/python/test_topi_group_conv2d.py +++ b/tests/python/topi/python/test_topi_group_conv2d.py @@ -35,6 +35,10 @@ "gpu": (topi.cuda.group_conv2d_nchw, topi.cuda.schedule_group_conv2d_nchw), } +_group_conv2d_nhwc_implement = { + "generic": (topi.nn.group_conv2d_nhwc, topi.generic.schedule_group_conv2d_nhwc), +} + def verify_group_conv2d_nchw( batch, @@ -275,6 +279,119 @@ def check_device(device): check_device(device) +def verify_group_conv2d_nhwc( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + groups, + add_bias=False, + add_relu=False, +): + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups) + ) + + in_height = in_width = in_size + + A = te.placeholder((batch, in_height, in_width, in_channel), name="A") + W = te.placeholder((kernel, kernel, in_channel // groups, num_filter), name="W") + bias = te.placeholder((1, 1, num_filter), name="bias") + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype + + @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nhwc") + def get_ref_data(): 
+ a_np = np.random.uniform(size=a_shape).astype(dtype) + w_np = np.random.uniform(size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) + c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding, groups).astype( + dtype + ) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + + return a_np, w_np, b_np, c_np + + a_np, w_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + + print("Running on target: %s" % device) + with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(device, _group_conv2d_nhwc_implement) + C = fcompute(A, W, stride, padding, dilation, groups, dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = fschedule([C]) + + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + if add_bias: + func = tvm.build( + s, + [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + groups, + ), + ) + func(a, w, b, c) + else: + func = tvm.build( + s, + [A, W, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + groups, + ), + ) + func(a, w, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + + for device in ["llvm"]: + check_device(device) + + @tvm.testing.uses_gpu def test_group_conv2d_nchw(): # ResNeXt-50 workload @@ -325,6 +442,30 @@ def test_group_conv2d_NCHWc_int8(): verify_group_conv2d_NCHWc_int8(9, 128, 56, 128, 3, 1, 1, 1, 32) +def test_group_conv2d_nhwc(): + # ResNeXt-50 workload + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32) + verify_group_conv2d_nhwc(1, 256, 56, 256, 3, 2, 1, 1, 32) + verify_group_conv2d_nhwc(1, 256, 28, 256, 3, 1, 1, 1, 32) + verify_group_conv2d_nhwc(1, 512, 28, 512, 3, 2, 1, 1, 32) + verify_group_conv2d_nhwc(1, 512, 14, 512, 3, 1, 1, 1, 32) + verify_group_conv2d_nhwc(1, 1024, 14, 1024, 3, 2, 1, 1, 32) + verify_group_conv2d_nhwc(1, 1024, 7, 1024, 3, 1, 1, 1, 32) + + # bias, relu + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True) + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True) + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True, add_bias=True) + + # dilation + verify_group_conv2d_nhwc(1, 128, 56, 128, 3, 1, 1, 2, 32) + + # batch size + verify_group_conv2d_nhwc(2, 128, 56, 128, 3, 1, 1, 1, 32) + verify_group_conv2d_nhwc(9, 128, 56, 128, 3, 1, 1, 1, 32) + + if __name__ == "__main__": test_group_conv2d_nchw() test_group_conv2d_NCHWc_int8() + test_group_conv2d_nhwc() diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index a32d41a27e17..f18b5397eefe 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -715,12 +715,10 @@ def check_device(device, ctx): check_device(device, ctx) -def verify_matrix_set_diag(input_shape, dtype): - diagonal_shape = list(input_shape[:-2]) - diagonal_shape.append(min(input_shape[-2], input_shape[-1])) 
+def verify_matrix_set_diag(input_shape, diagonal_shape, dtype, k=0, align="RIGHT_LEFT"): input = te.placeholder(shape=input_shape, name="input", dtype=dtype) diagonal = te.placeholder(shape=diagonal_shape, name="diagonal", dtype=dtype) - matrix_set_diag_result = topi.transform.matrix_set_diag(input, diagonal) + matrix_set_diag_result = topi.transform.matrix_set_diag(input, diagonal, k, align) def check_device(device, ctx): ctx = tvm.context(device, 0) @@ -730,7 +728,7 @@ def check_device(device, ctx): fn = tvm.build(s, [input, diagonal, matrix_set_diag_result], device, name="matrix_set_diag") input_npy = np.random.randint(-100, 100, size=input_shape).astype(dtype) diagonal_npy = np.random.randint(-100, 100, size=diagonal_shape).astype(dtype) - out_npy = tvm.topi.testing.matrix_set_diag(input_npy, diagonal_npy) + out_npy = tvm.topi.testing.matrix_set_diag(input_npy, diagonal_npy, k, align) input_nd = tvm.nd.array(input_npy, ctx) diagonal_nd = tvm.nd.array(diagonal_npy, ctx) out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(matrix_set_diag_result.dtype), ctx) @@ -1165,9 +1163,12 @@ def test_sparse_to_dense(): @tvm.testing.uses_gpu def test_matrix_set_diag(): for dtype in ["float32", "int32"]: - verify_matrix_set_diag((2, 2), dtype) - verify_matrix_set_diag((4, 3, 3), dtype) - verify_matrix_set_diag((2, 3, 4), dtype) + verify_matrix_set_diag((2, 2), (2,), dtype) + verify_matrix_set_diag((4, 3, 3), (4, 3), dtype) + verify_matrix_set_diag((2, 3, 4), (2, 3), dtype, 1) + verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "LEFT_RIGHT") + verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "LEFT_LEFT") + verify_matrix_set_diag((2, 3, 4), (2, 4, 3), dtype, (-1, 2), "RIGHT_RIGHT") @tvm.testing.uses_gpu diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index 0571ede488d1..c3afa6c65627 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -916,6 +916,12 @@ def test_cast_simplify(): ck.verify(tvm.tir.Cast(dtype1, tvm.tir.const(i, dtype2)), tvm.tir.const(i, dtype1)) +def test_shift_left_simplify(): + ck = RewriteChecker() + z = tvm.tir.op.call_intrin("int32", "tir.shift_left", 1, 10) + ck.verify(z, tvm.tir.const(1 << 10, "int32")) + + if __name__ == "__main__": test_floordiv_index_simplify() test_floormod_index_simplify() @@ -932,3 +938,4 @@ def test_cast_simplify(): test_logical_simplify() test_let_simplify() test_cast_simplify() + test_shift_left_simplify() diff --git a/tests/python/unittest/test_auto_scheduler_common.py b/tests/python/unittest/test_auto_scheduler_common.py index 33e498ecdfd0..eaf328c6303a 100644 --- a/tests/python/unittest/test_auto_scheduler_common.py +++ b/tests/python/unittest/test_auto_scheduler_common.py @@ -40,6 +40,19 @@ def matmul_auto_scheduler_test(N, M, K): return [A, B, C] +@auto_scheduler.register_workload +def double_matmul_auto_scheduler_test(N): + A = te.placeholder((N, N), name="A", dtype="float32") + B = te.placeholder((N, N), name="B", dtype="float32") + C = te.placeholder((N, N), name="C", dtype="float32") + k = te.reduce_axis((0, N), name="k") + D = te.compute((N, N), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="D") + k = te.reduce_axis((0, N), name="k") + E = te.compute((N, N), lambda i, j: te.sum(D[i][k] * C[k][j], axis=[k]), name="E") + + return [A, B, C, E] + + # Test for register_workload with different name @auto_scheduler.register_workload("matmul_auto_scheduler_test_rename_1") def 
matmul_auto_scheduler_test_rename_0(N, M, K): diff --git a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py index eb706b7e6976..bf6efd0bf11d 100644 --- a/tests/python/unittest/test_auto_scheduler_evolutionary_search.py +++ b/tests/python/unittest/test_auto_scheduler_evolutionary_search.py @@ -47,7 +47,7 @@ def test_evo_search(): workload_key = auto_scheduler.make_workload_key(matmul_auto_scheduler_test, (10, 10, 4)) dag = auto_scheduler.ComputeDAG(workload_key) task = auto_scheduler.SearchTask(dag, workload_key, tvm.target.Target("llvm")) - policy = auto_scheduler.SketchPolicy(task, schedule_cost_model=MockCostModel(), verbose=0) + policy = auto_scheduler.SketchPolicy(task, program_cost_model=MockCostModel(), verbose=0) states = policy.sample_initial_population(50) pruned_states = [] for state in states: diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index aba27840a61f..caa1d6a99f40 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -45,7 +45,7 @@ def test_layout_rewrite_correctness(): workload = matmul_auto_scheduler_test workload_key = auto_scheduler.make_workload_key(workload, (N, N, N)) dag = auto_scheduler.ComputeDAG(workload_key) - target = tvm.target.create(target) + target = tvm.target.Target(target) task = auto_scheduler.SearchTask(dag, workload_key, target) with tempfile.NamedTemporaryFile() as fp: diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py index 6ec96a6f544a..04b54b2858cf 100644 --- a/tests/python/unittest/test_auto_scheduler_search_policy.py +++ b/tests/python/unittest/test_auto_scheduler_search_policy.py @@ -57,7 +57,7 @@ def search_common( search_policy = auto_scheduler.EmptyPolicy(task) elif search_policy == "sketch": search_policy = auto_scheduler.SketchPolicy( - task, schedule_cost_model=cost_model, init_search_callbacks=init_search_callbacks + task, program_cost_model=cost_model, init_search_callbacks=init_search_callbacks ) tuning_options = auto_scheduler.TuningOptions( diff --git a/tests/python/unittest/test_auto_scheduler_sketch_generation.py b/tests/python/unittest/test_auto_scheduler_sketch_generation.py index fa67756833bf..5a687daf686a 100644 --- a/tests/python/unittest/test_auto_scheduler_sketch_generation.py +++ b/tests/python/unittest/test_auto_scheduler_sketch_generation.py @@ -25,6 +25,7 @@ from test_auto_scheduler_common import ( matmul_auto_scheduler_test, + double_matmul_auto_scheduler_test, conv2d_nchw_bn_relu_auto_scheduler_test, max_pool2d_auto_scheduler_test, min_nm_auto_scheduler_test, @@ -73,9 +74,9 @@ def assert_has_cross_thread_reduction(state, stage_id): def test_cpu_matmul_sketch(): sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "llvm") """ 3 multi-level tiling sketches - 0 - Multi-level tiling - 1 - Multi-level tiling with cache write on position 0 - 2 - Multi-level tiling with cache write on position 1 + No.0 : Multi-level tiling + No.1 : Multi-level tiling with cache write on position 0 + No.2 : Multi-level tiling with cache write on position 1 """ assert len(sketches) == 3 # Sketch 0 @@ -92,11 +93,11 @@ def test_cpu_matmul_sketch(): sketches = generate_sketches(matmul_auto_scheduler_test, (8, 8, 512), "llvm") """ 2 rfactor sketches + 3 multi-level tiling sketches - 0 - 
Rfactor with factor position 0 - 1 - Rfactor with factor position 1 - 2 - Multi-level tiling - 3 - Multi-level tiling with cache write on position 0 - 4 - Multi-level tiling with cache write on position 1 + No.0 : Rfactor with factor position 0 + No.1 : Rfactor with factor position 1 + No.2 : Multi-level tiling + No.3 : Multi-level tiling with cache write on position 0 + No.4 : Multi-level tiling with cache write on position 1 """ assert len(sketches) == 5 # Sketch 0 @@ -116,15 +117,20 @@ def test_cpu_matmul_sketch(): assert_compute_at_condition(sketches[4].stages[2], "iter") assert sketches[3] != sketches[4] + sketches = generate_sketches(double_matmul_auto_scheduler_test, (512,), "llvm") + """ 3 multi-level tiling sketches for one matmul, so 3 * 3 = 9 sketches in total """ + assert len(sketches) == 9 + assert_is_tiled(sketches[8].stages[5]) + def test_cpu_conv2d_bn_relu_sketch(): sketches = generate_sketches( conv2d_nchw_bn_relu_auto_scheduler_test, (1, 56, 56, 512, 512, 3, 1, 1), "llvm" ) """ 3 multi-level tiling sketches - 0 - Conv2d multi-level tiling with fusion on position 0 - 1 - Conv2d multi-level tiling with fusion on position 1 - 2 - Conv2d multi-level tiling without fusion + No.0 : Conv2d multi-level tiling with fusion on position 0 + No.1 : Conv2d multi-level tiling with fusion on position 1 + No.2 : Conv2d multi-level tiling without fusion """ assert len(sketches) == 3 # Sketch 0 @@ -164,9 +170,9 @@ def test_cpu_max_pool2d_sketch(): def test_cpu_min_sketch(): sketches = generate_sketches(min_nm_auto_scheduler_test, (10, 1024), "llvm") """ 2 rfactor sketches + 1 default sketch - 0 - Rfactor with factor position 0 - 1 - Rfactor with factor position 1 - 2 - Default sketch + No.0 : Rfactor with factor position 0 + No.1 : Rfactor with factor position 1 + No.2 : Default sketch """ assert len(sketches) == 3 # Sketch 0 @@ -209,9 +215,9 @@ def test_cpu_conv2d_winograd_sketch(): conv2d_winograd_nhwc_auto_scheduler_test, (1, 28, 28, 128, 128, 3, 1, 1), "llvm" ) """ 3 multi-level tiling sketches - 0 - Bgemm multi-level tiling - 1 - Bgemm multi-level tiling with cache write on position 0 - 2 - Bgemm multi-level tiling with cache write on position 1 + No.0 : Bgemm multi-level tiling + No.1 : Bgemm multi-level tiling with cache write on position 0 + No.2 : Bgemm multi-level tiling with cache write on position 1 """ assert len(sketches) == 3 # Sketch 0 @@ -277,6 +283,12 @@ def test_cuda_matmul_sketch(): assert_compute_at_condition(sketches[1].stages[4], "iter") assert_is_tiled(sketches[1].stages[5]) + sketches = generate_sketches(double_matmul_auto_scheduler_test, (512,), "cuda") + """ 1 multi-level tiling sketch for one matmul, so 1 x 1 = 1 sketch in total """ + assert len(sketches) == 1 + assert_compute_at_condition(sketches[0].stages[5], "root") + assert_compute_at_condition(sketches[0].stages[6], "iter") + @tvm.testing.requires_cuda def test_cuda_conv2d_bn_relu_sketch(): diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py new file mode 100644 index 000000000000..74b37754c913 --- /dev/null +++ b/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py @@ -0,0 +1,191 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +.. _auto-scheduler-conv-gpu: + +Auto-scheduling a convolution layer for GPU +=========================================== +**Author**: `Lianmin Zheng `_, \ + `Chengfan Jia `_ + + +Different from the existing :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any templates. +The auto-scheduler is template-free, so users only need to write the computation declaration without +any schedule commands or templates. +The auto-scheduler can automatically generate a large +search space and find a good schedule in the space. + +We use a convolution layer as an example in this tutorial. +""" + +import numpy as np +import tvm +from tvm import te, testing, auto_scheduler, topi +from tvm.topi.testing import conv2d_nchw_python + +###################################################################### +# Define the computation +# ^^^^^^^^^^^^^^^^^^^^^^ +# To begin with, let us define the computation of a convolution layer. +# The function should return the list of input/output tensors. +# From these tensors, the auto-scheduler can get the whole computational graph. + + +@auto_scheduler.register_workload +def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): + data = te.placeholder((N, CI, H, W), name="data") + kernel = te.placeholder((CO, CI, KH, KW), name="kernel") + bias = te.placeholder((1, CO, 1, 1), name="bias") + conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32") + out = topi.nn.relu(conv + bias) + return [data, kernel, bias, out] + + +###################################################################### +# Create the search task +# ^^^^^^^^^^^^^^^^^^^^^^ +# We then create a search task for the last convolution layer in ResNet. + +target = tvm.target.Target("cuda") + +# the last layer in ResNet +N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) +task = auto_scheduler.create_task(conv2d_layer, (N, H, W, CO, CI, KH, KW, strides, padding), target) + +# Inspect the computational graph +print(task.compute_dag) + +###################################################################### +# Next, we set parameters for the auto-scheduler. These parameters +# mainly specify how we do the measurement during the search and auto-tuning. +# +# * :code:`measure_ctx` launches a different process for measurement. This +# provides isolation. It can protect the master process from GPU crashes +# that happen during measurement and avoid other runtime conflicts. +# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. +# This can warm up the GPU, which is necessary to get accurate measurement results. +# Typically, we recommend a value > 300 ms. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. +# We only make 10 trials in this tutorial for a fast demonstration.
In practice, 1000 is a +# good value for the search to converge. You can do more trials according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `conv2d.json`. +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. + +measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=10, + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile("conv2d.json")], +) + +###################################################################### +# Run the search +# ^^^^^^^^^^^^^^ +# Now we get all inputs ready. Pretty simple, isn't it? +# We can kick off the search and let the auto-scheduler do its magic. +# After some measurement trials, it will return the best schedule it found. + +sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option) + +###################################################################### +# We can lower the schedule to see the IR after auto-scheduling. +# The auto-scheduler correctly performs optimizations including multi-level tiling, +# cooperative fetching, unrolling and operator fusion. + +print(tvm.lower(sch, args, simple_mode=True)) + +###################################################################### +# Check correctness and evaluate performance +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# We build the binary and check its correctness and performance. + +func = tvm.build(sch, args, target) + +# check correctness +data_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) +weight_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) +bias_np = np.random.uniform(size=(1, CO, 1, 1)).astype(np.float32) +conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding) +out_np = np.maximum(conv_np + bias_np, 0.0) + +ctx = tvm.gpu() +data_tvm = tvm.nd.array(data_np, ctx=ctx) +weight_tvm = tvm.nd.array(weight_np, ctx=ctx) +bias_tvm = tvm.nd.array(bias_np, ctx=ctx) +out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx) +func(data_tvm, weight_tvm, bias_tvm, out_tvm) + +# Check results +tvm.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3) + +# Evaluate execution time +evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +print( + "Execution time of this operator: %.3f ms" + % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000) +) + +###################################################################### +# Using the record file +# ^^^^^^^^^^^^^^^^^^^^^ +# During the search, all measurement records are dumped into the record +# file "conv2d.json". The measurement records can be used to re-apply search results, +# resume the search, and perform other analyses. + +###################################################################### +# Here is an example where we load the best schedule from a file, +# print the equivalent python schedule API, and build the binary again. + +# Load the measurement record for the best schedule +inp, res = auto_scheduler.load_best("conv2d.json", task.workload_key) + +# Print equivalent python schedule API. This can be used for debugging and +# learning the behavior of the auto-scheduler. +print("Equivalent python schedule:") +print(task.compute_dag.print_python_code_from_state(inp.state)) + +# Rebuild the binary.
This shows how you can apply the best schedule from a +# log file without rerunning the search. +sch, args = task.compute_dag.apply_steps_from_state(inp.state) +func = tvm.build(sch, args, target) + +###################################################################### +# A more complicated example is to resume the search. +# In this case, we need to create the search policy and cost model by ourselves +# and resume the status of the search policy and cost model from the log file. +# In the example below we resume the status and do 5 more trials. + + +log_file = "conv2d.json" +cost_model = auto_scheduler.XGBModel() +cost_model.update_from_file(log_file) +search_policy = auto_scheduler.SketchPolicy( + task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] +) +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=5, + runner=measure_ctx.runner, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], +) +sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option) + +# kill the measurement process +del measure_ctx diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 1a9af42510eb..e5f9d7e9a27d 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -37,7 +37,7 @@ ###################################################################### # Define the computation # ^^^^^^^^^^^^^^^^^^^^^^ -# To begin with, we define the computation of a matmul with bias add. +# To begin with, let us define the computation of a matmul with bias add. # The function should return the list of input/output tensors. # From these tensors, the auto-scheduler can get the whole computational graph. @@ -59,9 +59,13 @@ def matmul_add(N, L, M, dtype): # Create the search task # ^^^^^^^^^^^^^^^^^^^^^^ # We then create a search task with N=L=M=128 and dtype="float32" +# If your machine supports AVX instructions, you can +# +# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 +# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 target = tvm.target.Target("llvm") -task = auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) +task = tvm.auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) # Inspect the computational graph print(task.compute_dag) @@ -69,13 +73,13 @@ def matmul_add(N, L, M, dtype): ###################################################################### # Next, we set parameters for the auto-scheduler. # -# * `num_measure_trials` is the number of measurement trials we can use during the search. +# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. # We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a # good value for the search to converge. You can do more trials according to your time budget. -# * In addition, we use `RecordToFile` to dump measurement records into a file `matmul.json`. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `matmul.json`. # The measurement records can be used to query the history best, resume the search, # and do more analyses later.
-# * see :any:`auto_schedule.TuningOptions`: for more parameters +# * see :any:`auto_scheduler.TuningOptions` for more parameters tune_option = auto_scheduler.TuningOptions( num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile("matmul.json")] @@ -93,25 +97,38 @@ def matmul_add(N, L, M, dtype): ###################################################################### # We can lower the schedule to see the IR after auto-scheduling. # The auto-scheduler correctly performs optimizations including multi-level tiling, -# parallelization, vectorization, unrolling and fusion. +# parallelization, vectorization, unrolling and operator fusion. print(tvm.lower(sch, args, simple_mode=True)) ###################################################################### -# Check correctness -# ^^^^^^^^^^^^^^^^^ -# We build the binary and check its correctness +# Check correctness and evaluate performance +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# We build the binary and check its correctness and performance. func = tvm.build(sch, args) a_np = np.random.uniform(size=(128, 128)).astype(np.float32) b_np = np.random.uniform(size=(128, 128)).astype(np.float32) c_np = np.random.uniform(size=(128, 128)).astype(np.float32) -d_np = a_np.dot(b_np) + c_np - -d_tvm = tvm.nd.empty(d_np.shape) -func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm) +out_np = a_np.dot(b_np) + c_np + +ctx = tvm.cpu() +a_tvm = tvm.nd.array(a_np, ctx=ctx) +b_tvm = tvm.nd.array(b_np, ctx=ctx) +c_tvm = tvm.nd.array(c_np, ctx=ctx) +out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx) +func(a_tvm, b_tvm, c_tvm, out_tvm) + +# Check results +tvm.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3) + +# Evaluate execution time. +evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +print( + "Execution time of this operator: %.3f ms" + % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) +) -tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-3) ###################################################################### # Using the record file @@ -129,6 +146,7 @@ def matmul_add(N, L, M, dtype): # Print equivalent python schedule API. This can be used for debugging and # learning the behavior of the auto-scheduler. +print("Equivalent python schedule:") print(task.compute_dag.print_python_code_from_state(inp.state)) # Rebuild the binary. This shows how you can apply the best schedule from a @@ -161,13 +179,16 @@ def resume_search(task, log_file): # .. note:: # We cannot run the line above because of the conflict between # python's multiprocessing and tvm's thread pool. -# After running a tvm generated binary (L112), the python's multiprocessing -# library will hang forever. -# You have to make sure that you don't run any tvm generated binaries before -# calling ansor's search. To run the L156 above, you should comment out L112-114. +# After running a tvm generated binary, Python's multiprocessing library +# will hang forever. You have to make sure that you don't run any tvm +# generated binaries before calling the auto-scheduler's search. +# To run the function above, you should comment out all code in +# the "Check correctness and evaluate performance" section. # # You should be careful about this problem in your applications. # There are other workarounds for this problem. # For example, you can start a new thread/process (with the builtin python library # threading or multiprocessing) and run the tvm binaries in the new thread/process. # This provides an isolation and avoids the conflict in the main thread/process. +# You can also use :any:`auto_scheduler.LocalRPCMeasureContext` for auto-scheduler, +# as shown in the GPU tutorial (:ref:`auto-scheduler-conv-gpu`).
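The process-isolation workaround described in the note above can be sketched as follows (illustrative only, not part of the tutorial; a trivial vector-add kernel stands in for the tuned binary):

import multiprocessing

import numpy as np
import tvm
from tvm import te


def _run_tvm_binary_in_child():
    # Build and execute a TVM-generated binary entirely inside the child
    # process, so the parent process never runs generated code itself.
    n = 128
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    func = tvm.build(te.create_schedule(B.op), [A, B], "llvm")
    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"))
    b = tvm.nd.empty((n,), "float32")
    func(a, b)


if __name__ == "__main__":
    proc = multiprocessing.Process(target=_run_tvm_binary_in_child)
    proc.start()
    proc.join()
    # The parent has not executed any TVM-generated binary, so it is still
    # safe to start the auto-scheduler search from here.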
diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index bbfb410764a5..4cbafaf3c14a 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -195,6 +195,7 @@ names = [x.strip() for x in content] +tvm.relay.testing.yolo_detection.show_detections(img, dets, thresh, names, last_layer.classes) tvm.relay.testing.yolo_detection.draw_detections( font_path, img, dets, thresh, names, last_layer.classes ) diff --git a/tutorials/get_started/tensor_expr_get_started.py b/tutorials/get_started/tensor_expr_get_started.py index 76e02625ef04..0ca2243c7c3b 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/tutorials/get_started/tensor_expr_get_started.py @@ -30,6 +30,7 @@ from __future__ import absolute_import, print_function import tvm +import tvm.testing from tvm import te import numpy as np diff --git a/tutorials/language/reduction.py b/tutorials/language/reduction.py index ecefc28776b7..cffa10e6cbb2 100644 --- a/tutorials/language/reduction.py +++ b/tutorials/language/reduction.py @@ -28,6 +28,7 @@ from __future__ import absolute_import, print_function import tvm +import tvm.testing from tvm import te import numpy as np diff --git a/tutorials/language/scan.py b/tutorials/language/scan.py index fdb6ec99eae6..5f513208d56d 100644 --- a/tutorials/language/scan.py +++ b/tutorials/language/scan.py @@ -25,6 +25,7 @@ from __future__ import absolute_import, print_function import tvm +import tvm.testing from tvm import te import numpy as np diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py index c9812fff6d2c..2459cf249506 100644 --- a/tutorials/topi/intro_topi.py +++ b/tutorials/topi/intro_topi.py @@ -26,6 +26,7 @@ from __future__ import absolute_import, print_function import tvm +import tvm.testing from tvm import te from tvm import topi import numpy as np diff --git a/web/package.json b/web/package.json index 25fca5088e78..1f52a0781e60 100644 --- a/web/package.json +++ b/web/package.json @@ -20,7 +20,7 @@ "@types/node": "^12.12.37", "@typescript-eslint/eslint-plugin": "^2.29.0", "@typescript-eslint/parser": "^2.29.0", - "@webgpu/types": "^0.0.24", + "@webgpu/types": "^0.0.31", "eslint": "^6.8.0", "jest": "^26.0.1", "rollup": "^2.7.6", diff --git a/web/src/rpc_server.ts b/web/src/rpc_server.ts index 542558aa157f..c63dcf3a9ae3 100644 --- a/web/src/rpc_server.ts +++ b/web/src/rpc_server.ts @@ -252,8 +252,8 @@ export class RPCServer { this.logger ); try { - const gpuDevice: GPUDevice | undefined = await detectGPUDevice(); - if (gpuDevice !== undefined) { + const gpuDevice: GPUDevice | undefined | null = await detectGPUDevice(); + if (gpuDevice !== undefined && gpuDevice !== null) { const label = gpuDevice.label?.toString() || "WebGPU"; this.log("Initialize GPU device: " + label); inst.initWebGPU(gpuDevice); diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts index 640f7b4a7163..f12837f421f8 100644 --- a/web/src/webgpu.ts +++ b/web/src/webgpu.ts @@ -27,10 +27,10 @@ export type GPUPointer = number; /** * DetectGPU device in the environment.
*/ -export async function detectGPUDevice(): Promise<GPUDevice | undefined> { +export async function detectGPUDevice(): Promise<GPUDevice | undefined | null> { if (typeof navigator !== "undefined" && navigator.gpu !== undefined) { const adapter = await navigator.gpu.requestAdapter(); - return await adapter.requestDevice(); + return await adapter?.requestDevice(); } else { return undefined; } @@ -235,11 +235,14 @@ export class WebGPUContext { nbytes: number ): void { // Perhaps it would be more useful to use a staging buffer? - const [gpuTemp, cpuTemp] = this.device.createBufferMapped({ + const gpuTemp = this.device.createBuffer({ + mappedAtCreation: true, size: nbytes, - usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC, + usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.COPY_SRC }); + const cpuTemp = gpuTemp.getMappedRange(); + const viewU8 = new Uint8Array(cpuTemp); viewU8.set(this.memory.loadRawBytes(from, nbytes)); gpuTemp.unmap(); @@ -281,8 +284,9 @@ export class WebGPUContext { this.device.defaultQueue.submit([copyCommands]); this.numPendingReads += 1; - const readEvent = gpuTemp.mapReadAsync().then((data: ArrayBuffer) => { - this.memory.storeRawBytes(to, new Uint8Array(data)); + + const readEvent = gpuTemp.mapAsync(GPUMapMode.READ).then((data: unknown) => { + this.memory.storeRawBytes(to, new Uint8Array(data as ArrayBuffer)); this.numPendingReads -= 1; gpuTemp.destroy(); });