diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake index 178b8fedbd3..205224541a2 100644 --- a/cmake/oneflow.cmake +++ b/cmake/oneflow.cmake @@ -197,7 +197,7 @@ generate_functional_api_and_pybind11_cpp(FUNCTIONAL_GENERATED_SRCS FUNCTIONAL_GE FUNCTIONAL_PYBIND11_SRCS ${PROJECT_SOURCE_DIR}) oneflow_add_library(of_functional_obj STATIC ${FUNCTIONAL_GENERATED_SRCS} ${FUNCTIONAL_GENERATED_HRCS}) -target_link_libraries(of_functional_obj glog::glog) +target_link_libraries(of_functional_obj LLVMSupportWithHeader glog::glog) add_dependencies(of_functional_obj prepare_oneflow_third_party) if(BUILD_PYTHON) @@ -214,7 +214,7 @@ if(BUILD_PYTHON) of_functional_tensor_obj STATIC ${FUNCTIONAL_TENSOR_GENERATED_SRCS} ${FUNCTIONAL_TENSOR_GENERATED_HRCS} ${FUNCTIONAL_OPS_GENERATED_SRCS} ${FUNCTIONAL_OPS_GENERATED_HRCS}) - target_link_libraries(of_functional_tensor_obj glog::glog) + target_link_libraries(of_functional_tensor_obj LLVMSupportWithHeader glog::glog) add_dependencies(of_functional_tensor_obj prepare_oneflow_third_party) target_include_directories(of_functional_tensor_obj PRIVATE ${Python_INCLUDE_DIRS} ${Python_NumPy_INCLUDE_DIRS}) @@ -274,6 +274,22 @@ if(WITH_MLIR) set(ONEFLOW_MLIR_LIBS -Wl,--no-as-needed MLIROneFlowExtension -Wl,--as-needed) endif() +if("${LLVM_PROVIDER}" STREQUAL "install") + get_property(LLVM_INSTALL_DIR GLOBAL PROPERTY LLVM_INSTALL_DIR) + check_variable_defined(LLVM_INSTALL_DIR) + find_library(LLVMSupportLib LLVMSupport PATHS ${LLVM_INSTALL_DIR}/lib REQUIRED) + add_library(LLVMSupportWithHeader UNKNOWN IMPORTED) + set_property(TARGET LLVMSupportWithHeader PROPERTY IMPORTED_LOCATION ${LLVMSupportLib}) +else() + add_library(LLVMSupportWithHeader INTERFACE IMPORTED) + target_link_libraries(LLVMSupportWithHeader INTERFACE LLVMSupport) +endif() +check_variable_defined(LLVM_INCLUDE_DIRS) +set_property(TARGET LLVMSupportWithHeader PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${LLVM_INCLUDE_DIRS}) + +list(APPEND oneflow_third_party_libs LLVMSupportWithHeader) + 
include(op_schema) get_property(EXTERNAL_INCLUDE_DIRS GLOBAL PROPERTY EXTERNAL_INCLUDE_DIRS) diff --git a/cmake/op_schema.cmake b/cmake/op_schema.cmake index ce790c1918b..5017fab574e 100644 --- a/cmake/op_schema.cmake +++ b/cmake/op_schema.cmake @@ -81,5 +81,5 @@ set_source_files_properties(${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP} TRUE) oneflow_add_library(of_op_schema OBJECT ${GENERATED_OP_SCHEMA_H} ${GENERATED_OP_SCHEMA_CPP}) -target_link_libraries(of_op_schema glog::glog) +target_link_libraries(of_op_schema LLVMSupportWithHeader glog::glog) add_dependencies(of_op_schema prepare_oneflow_third_party) diff --git a/cmake/util.cmake b/cmake/util.cmake index 3aaae830e12..a69128f416e 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -269,6 +269,12 @@ function(set_compile_options_to_oneflow_target target) endif() endfunction() +function(check_variable_defined variable) + if(NOT DEFINED ${variable}) + message(FATAL_ERROR "Variable ${variable} is not defined") + endif() +endfunction() + function(checkDirAndAppendSlash) set(singleValues DIR;OUTPUT) set(prefix ARG) diff --git a/docs/source/graph.rst b/docs/source/graph.rst index 5ec08061a8a..270e5a01cf0 100644 --- a/docs/source/graph.rst +++ b/docs/source/graph.rst @@ -20,12 +20,11 @@ Base class for running neural networks in Static Graph Mode. .. 
autoclass:: oneflow.nn.graph.graph_config.GraphConfig :members: enable_amp, + enable_zero, allow_fuse_model_update_ops, allow_fuse_add_to_output, allow_fuse_cast_scale, set_gradient_accumulation_steps, - set_zero_redundancy_optimizer_mode, - set_zero_redundancy_optimizer_min_size_after_split, enable_cudnn_conv_heuristic_search_algo, :member-order: bysource diff --git a/oneflow/api/python/framework/nn_graph.cpp b/oneflow/api/python/framework/nn_graph.cpp index e02d86e9eb1..9e0c939b3e2 100644 --- a/oneflow/api/python/framework/nn_graph.cpp +++ b/oneflow/api/python/framework/nn_graph.cpp @@ -80,12 +80,18 @@ ONEFLOW_API_PYBIND11_MODULE("nn.graph.", m) { m.def("RunLazyNNGraph", &RunLazyNNGraph); m.def("SoftSyncNNGraphBuffers", &SoftSyncNNGraphBuffers); m.def("AddTensorAsGraphLoss", &AddTensorAsGraphLoss); + m.def("ConvertJobToTosaIR", [](const std::string& serialized_job) -> Maybe { + Job job; + CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)) + << "serialized job conversion failed."; + return ConvertJobToTosaIR(&job); + }); m.def("SaveJobToIR", [](const std::string& serialized_job, const std::string& path) -> Maybe { Job job; - CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)); + CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job)) + << "serialized job conversion failed."; return SaveJobToIR(&job, path); - ; }); m.def("LoadSerializedJobFromIR", [](const std::string& path) -> Maybe { Job job; diff --git a/oneflow/core/autograd/gradient_funcs/normalization.cpp b/oneflow/core/autograd/gradient_funcs/normalization.cpp index e336edf519d..c12fcb60442 100644 --- a/oneflow/core/autograd/gradient_funcs/normalization.cpp +++ b/oneflow/core/autograd/gradient_funcs/normalization.cpp @@ -136,15 +136,14 @@ class NormalizationGrad : public OpExprGradFunction::Ok(); } - DimVector dim_vec; + Shape shape; for (int i = 0; i < x->shape()->NumAxes(); ++i) { if (i != ctx->axis) { - dim_vec.emplace_back(1); + shape.emplace_back(1); } else { - 
dim_vec.emplace_back(x->shape()->At(ctx->axis)); + shape.emplace_back(x->shape()->At(ctx->axis)); } } - Shape shape(dim_vec); const auto& reshaped_gamma = JUST(functional::Reshape(gamma, shape)); const auto& reshaped_inv_variance = JUST(functional::Reshape(inv_variance, shape)); diff --git a/oneflow/core/common/fixed_vector.h b/oneflow/core/common/fixed_vector.h deleted file mode 100644 index b3d1c98c827..00000000000 --- a/oneflow/core/common/fixed_vector.h +++ /dev/null @@ -1,277 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ -#define ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ - -#include -#include -#include -#include - -namespace oneflow { - -template -using RequireInputIter = typename std::enable_if< - std::is_convertible::iterator_category, - std::input_iterator_tag>::value>::type; - -template -class fixed_vector final { - public: - using value_type = T; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - using reference = value_type&; - using const_reference = const value_type&; - using pointer = T*; - using const_pointer = const T*; - using iterator = T*; - using const_iterator = const T*; - using reverse_iterator = std::reverse_iterator; - using const_reverse_iterator = std::reverse_iterator; - - fixed_vector() : size_(0) {} - explicit fixed_vector(size_t size) { assign(size, T()); } - explicit fixed_vector(size_t size, const T& val) { assign(size, val); } - template> - fixed_vector(InputIt first, InputIt last) { - assign(first, last); - } - fixed_vector(const fixed_vector& rhs) { *this = rhs; } - fixed_vector(fixed_vector&& rhs) { *this = std::move(rhs); } - fixed_vector(std::initializer_list rhs) { assign(rhs); } - ~fixed_vector() = default; - - fixed_vector& operator=(const fixed_vector& rhs) { - size_ = rhs.size(); - CheckSize(); - std::copy(rhs.begin(), rhs.end(), begin()); - return *this; - } - fixed_vector& operator=(fixed_vector&& rhs) noexcept { - size_ = rhs.size(); - CheckSize(); - std::copy(rhs.begin(), rhs.end(), begin()); - return *this; - } - fixed_vector& operator=(std::initializer_list ilist) { - size_ = ilist.size(); - assign(ilist); - return *this; - } - void assign(size_type count, const value_type& value) { - size_ = count; - CheckSize(); - std::fill(begin(), begin() + size_, value); - } - template> - void assign(InputIt first, InputIt last) { - size_ = last - first; - CheckSize(); - std::copy(first, last, begin()); - } - void assign(std::initializer_list ilist) { - size_ = ilist.size(); - 
CheckSize(); - std::copy(ilist.begin(), ilist.end(), begin()); - } - - reference at(size_type pos) { - CheckPos(pos); - return data_.at(pos); - } - const_reference at(size_type pos) const { - CheckPos(pos); - return data_.at(pos); - } - - reference operator[](size_type pos) { - CheckPos(pos); - return data_[pos]; - } - const_reference operator[](size_type pos) const { - CheckPos(pos); - return data_[pos]; - } - - reference front() { - CheckPos(0); - return data_.at(0); - } - const_reference front() const { - CheckPos(0); - return data_.at(0); - } - - reference back() { - CheckPos(0); - return data_.at(size_ - 1); - } - const_reference back() const { - CheckPos(0); - return data_.at(size_ - 1); - } - - T* data() noexcept { return data_.data(); } - const T* data() const noexcept { return data_.data(); } - - iterator begin() noexcept { return data_.data(); } - const_iterator begin() const noexcept { return data_.data(); } - const_iterator cbegin() const noexcept { return data_.data(); } - - iterator end() noexcept { return data_.data() + size_; } - const_iterator end() const noexcept { return data_.data() + size_; } - const_iterator cend() const noexcept { return data_.data() + size_; } - - reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } - const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); } - - reverse_iterator rend() noexcept { return reverse_iterator(begin()); } - const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } - const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); } - - bool empty() const noexcept { return size_ == 0; } - - size_type size() const noexcept { return size_; } - - size_type max_size() const noexcept { return kMaxSize; } - - size_type capacity() const noexcept { return kMaxSize; } - - void clear() noexcept { size_ = 0; } - - 
iterator insert(iterator pos, const T& value) { - MoveNToEnd(pos, 1); - *pos = value; - return pos; - } - iterator insert(iterator pos, T&& value) { - MoveNToEnd(pos, 1); - *pos = std::move(value); - return pos; - } - iterator insert(iterator pos, size_type count, const T& value) { - MoveNToEnd(pos, count); - std::fill(pos, pos + count, value); - return pos; - } - template> - void insert(iterator pos, InputIt first, InputIt last) { - MoveNToEnd(pos, last - first); - std::copy(first, last, pos); - } - iterator insert(iterator pos, std::initializer_list ilist) { - MoveNToEnd(pos, ilist.size()); - std::copy(ilist.begin(), ilist.end(), pos); - return pos; - } - - template - iterator emplace(iterator pos, Args&&... args) { - MoveNToEnd(pos, 1); - new (&*pos) T(std::forward(args)...); - return pos; - } - - iterator erase(iterator pos) { - MoveNToBegin(pos + 1, 1); - return pos; - } - iterator erase(iterator first, iterator last) { - if (first >= last) { return last; } - MoveNToBegin(last, last - first); - return first; - } - - void push_back(const T& value) { insert(end(), value); } - void push_back(T&& value) { insert(end(), std::move(value)); } - void emplace_back(const T& value) { insert(end(), value); } - template - void emplace_back(Args&&... 
args) { - insert(end(), std::forward(args)...); - } - - void pop_back() { --size_; } - - void resize(size_type count) { resize(count, T()); } - void resize(size_type count, const value_type& value) { - if (count == size_) { return; } - if (count < size_) { - erase(begin() + count, end()); - return; - } - insert(end(), count - size_, value); - } - - void swap(fixed_vector& rhs) noexcept { - fixed_vector tmp; - tmp = rhs; - rhs = *this; - *this = tmp; - } - - bool operator==(const fixed_vector& rhs) const { - if (size() != rhs.size()) { return false; } - return std::equal(begin(), end(), rhs.begin()); - } - - bool operator!=(const fixed_vector& rhs) const { return !(*this == rhs); } - - bool operator>=(const fixed_vector& rhs) const { return !(*this < rhs); } - - bool operator<=(const fixed_vector& rhs) const { return !(*this > rhs); } - - bool operator>(const fixed_vector& rhs) const { - return std::lexicographical_compare(rhs.begin(), rhs.end(), begin(), end()); - } - - bool operator<(const fixed_vector& rhs) const { - return std::lexicographical_compare(begin(), end(), rhs.begin(), rhs.end()); - } - - private: - void CheckSize() const { CheckSize(size_); } - void CheckSize(size_t size) const { CHECK_LE(size, kMaxSize); } - void CheckPos(size_t pos) const { CHECK_LE(pos, size_); } - void MoveNToEnd(iterator first, size_t N) { - CheckSize(size_ + N); - iterator old_end = end(); - size_ += N; - iterator new_end = end(); - std::copy_backward(first, old_end, new_end); - } - void MoveNToBegin(iterator last, size_t N) { - CheckPos(last - N - begin()); - iterator old_end = end(); - size_ -= N; - std::copy(last, old_end, last - N); - } - - size_t size_; - std::array data_; -}; - -template -void swap(fixed_vector& lhs, fixed_vector& rhs) { - return lhs.swap(rhs); -} - -#define SHAPE_MAX_AXIS_SIZE 20 - -} // namespace oneflow - -#endif // ONEFLOW_CORE_COMMON_FIXED_VECTOR_H_ diff --git a/oneflow/core/common/fixed_vector_test.cpp b/oneflow/core/common/fixed_vector_test.cpp 
deleted file mode 100644 index cb79b7510e0..00000000000 --- a/oneflow/core/common/fixed_vector_test.cpp +++ /dev/null @@ -1,419 +0,0 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/fixed_vector.h" -#include "gtest/gtest.h" -#include -#include - -namespace oneflow { - -namespace test { - -using FixedVec = fixed_vector; - -TEST(fixed_vector, constructor_0) { - FixedVec a(8); - ASSERT_EQ(a.size(), 8); -} - -TEST(fixed_vector, constructor_1) { - int value = 30; - FixedVec a(8, value); - ASSERT_TRUE(std::all_of(a.begin(), a.end(), [value](const int x) { return x == value; })); -} - -TEST(fixed_vector, constructor_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a(vec.begin(), vec.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_3) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a(b); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_4) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a(std::move(b)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, constructor_5) { - std::vector vec{1, 2, 3, 4}; - FixedVec a{1, 2, 3, 4}; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_0) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - 
FixedVec a; - a = b; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_1) { - std::vector vec{1, 2, 3, 4}; - FixedVec b(vec.begin(), vec.end()); - FixedVec a; - a = std::move(b); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, operator_assign_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a = {1, 2, 3, 4}; - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, assign_0) { - int value = 30; - FixedVec a; - a.assign(8, value); - ASSERT_TRUE(std::all_of(a.begin(), a.end(), [value](const int x) { return x == value; })); -} - -TEST(fixed_vector, assign_1) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a.assign(vec.begin(), vec.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, assign_2) { - std::vector vec{1, 2, 3, 4}; - FixedVec a; - a.assign({1, 2, 3, 4}); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, const_at) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(a.at(0), value); -} - -TEST(fixed_vector, at) { - int value = 33; - FixedVec a{0}; - a.at(0) = value; - ASSERT_EQ(a.at(0), value); -} - -TEST(fixed_vector, const_front) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(a.front(), value); -} - -TEST(fixed_vector, front) { - int value = 33; - FixedVec a{0}; - a.front() = value; - ASSERT_EQ(a.front(), value); -} - -TEST(fixed_vector, const_back) { - int value = 33; - const FixedVec a{1, value}; - ASSERT_EQ(a.back(), value); -} - -TEST(fixed_vector, back) { - int value = 33; - FixedVec a{1, 0}; - a.back() = value; - ASSERT_EQ(a.back(), value); -} - -TEST(fixed_vector, const_data) { - int value = 33; - const FixedVec a{value}; - ASSERT_EQ(*a.data(), value); -} - -TEST(fixed_vector, data) { - int value = 33; - FixedVec a{0}; - *a.data() = value; - ASSERT_EQ(*a.data(), value); -} - -TEST(fixed_vector, const_begin) { - int value = 33; - const 
FixedVec a{value}; - ASSERT_EQ(*a.begin(), value); -} - -TEST(fixed_vector, begin) { - int value = 33; - FixedVec a{0}; - *a.begin() = value; - ASSERT_EQ(*a.begin(), value); -} - -TEST(fixed_vector, cbegin) { - int value = 33; - FixedVec a{value}; - ASSERT_EQ(*a.cbegin(), value); -} - -TEST(fixed_vector, const_end) { - const FixedVec a{0, 1, 2}; - ASSERT_EQ(a.begin() + a.size(), a.end()); -} - -TEST(fixed_vector, end) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.begin() + a.size(), a.end()); -} - -TEST(fixed_vector, cend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.cbegin() + a.size(), a.cend()); -} - -TEST(fixed_vector, const_rbegin) { - int value = 33; - const FixedVec a{0, value}; - ASSERT_EQ(*a.rbegin(), value); -} - -TEST(fixed_vector, rbegin) { - int value = 33; - FixedVec a{0, 0}; - *a.rbegin() = value; - ASSERT_EQ(*a.rbegin(), value); -} - -TEST(fixed_vector, crbegin) { - int value = 33; - FixedVec a{0, value}; - ASSERT_EQ(*a.crbegin(), value); -} - -TEST(fixed_vector, const_rend) { - const FixedVec a{0, 1, 2}; - ASSERT_EQ(a.rbegin() + a.size(), a.rend()); -} - -TEST(fixed_vector, rend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.rbegin() + a.size(), a.rend()); -} - -TEST(fixed_vector, crend) { - FixedVec a{0, 1, 2}; - ASSERT_EQ(a.crbegin() + a.size(), a.crend()); -} - -TEST(fixed_vector, insert_0) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), 0); - a.insert(a.end(), 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_1) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - int zero = 0; - int three = 3; - a.insert(a.begin(), std::move(zero)); - a.insert(a.end(), std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_2) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), 2, 0); - a.insert(a.end(), 2, 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_3) { - std::vector 
vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - int zero = 0; - int three = 3; - a.insert(a.begin(), 2, std::move(zero)); - a.insert(a.end(), 2, std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_4) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - std::vector zeros{0, 0}; - std::vector threes{3, 3}; - a.insert(a.begin(), zeros.begin(), zeros.end()); - a.insert(a.end(), threes.begin(), threes.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, insert_5) { - std::vector vec{0, 0, 1, 2, 3, 3}; - FixedVec a{1, 2}; - a.insert(a.begin(), {0, 0}); - a.insert(a.end(), {3, 3}); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, emplace) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{1, 2}; - a.emplace(a.begin(), 0); - a.emplace(a.end(), 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, erase_0) { - std::vector vec{1, 2}; - FixedVec a{0, 1, 2, 3}; - a.erase(a.begin()); - a.erase(a.end() - 1); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, erase_1) { - std::vector vec{1, 2}; - FixedVec a{0, 0, 1, 2, 3, 3}; - a.erase(a.begin(), a.begin() + 2); - a.erase(a.end() - 2, a.end()); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, push_back_0) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - a.emplace_back(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, push_back_1) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - int three = 3; - a.emplace_back(std::move(three)); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, emplace_back) { - std::vector vec{0, 1, 2, 3}; - FixedVec a{0, 1, 2}; - a.emplace_back(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, pop_back) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 
1, 2, 3}; - a.pop_back(); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_0) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 1, 2}; - a.resize(3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_1) { - std::vector vec{0, 1, 2}; - FixedVec a{0, 1, 2}; - a.resize(3, 9527); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_2) { - std::vector vec{0}; - FixedVec a{0, 1, 2}; - a.resize(1); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_3) { - std::vector vec{0}; - FixedVec a{0, 1, 2}; - a.resize(1, 9527); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_4) { - std::vector vec{0, 1, 2, 0, 0}; - FixedVec a{0, 1, 2}; - a.resize(5); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, resize_5) { - std::vector vec{0, 1, 2, 3, 3}; - FixedVec a{0, 1, 2}; - a.resize(5, 3); - ASSERT_TRUE(std::equal(a.begin(), a.end(), vec.begin())); -} - -TEST(fixed_vector, swap) { - std::vector vec_0{0, 1, 2, 0, 0}; - std::vector vec_1{0, 1, 2, 3, 3}; - FixedVec a_0(vec_1.begin(), vec_1.end()); - FixedVec a_1(vec_0.begin(), vec_0.end()); - a_0.swap(a_1); - ASSERT_TRUE(std::equal(a_0.begin(), a_0.end(), vec_0.begin())); - ASSERT_TRUE(std::equal(a_1.begin(), a_1.end(), vec_1.begin())); -} - -void WithTwoVector(std::function&, const std::vector&)> Handler) { - std::vector a{0, 1, 2, 3, 4}; - std::vector b{0, 1, 2, 3}; - std::vector c{4, 3, 2}; - Handler(a, a); - Handler(a, b); - Handler(a, c); - Handler(b, a); - Handler(b, b); - Handler(b, c); - Handler(c, a); - Handler(c, b); - Handler(c, c); -} - -#define TEST_LOGICAL_OPERATOR(test_name, logical_op) \ - TEST(fixed_vector, test_name) { \ - WithTwoVector([](const std::vector& lhs, const std::vector& rhs) { \ - ASSERT_EQ((lhs logical_op rhs), \ - (FixedVec(lhs.begin(), lhs.end()) logical_op 
FixedVec(rhs.begin(), rhs.end()))); \ - }); \ - } - -TEST_LOGICAL_OPERATOR(eq, ==); -TEST_LOGICAL_OPERATOR(ne, !=); -TEST_LOGICAL_OPERATOR(gt, >); -TEST_LOGICAL_OPERATOR(ge, >=); -TEST_LOGICAL_OPERATOR(lt, <); -TEST_LOGICAL_OPERATOR(le, <=); - -} // namespace test - -} // namespace oneflow diff --git a/oneflow/core/common/shape.cpp b/oneflow/core/common/shape.cpp index 2286cac3290..5d9b5a96a35 100644 --- a/oneflow/core/common/shape.cpp +++ b/oneflow/core/common/shape.cpp @@ -59,50 +59,32 @@ int64_t ShiftNegativeAxis(int64_t axis, const int64_t num_axes) { return axis; } -Shape::Shape(const std::initializer_list& dim_vec) - : dim_vec_(dim_vec), is_initialized_(true) {} -Shape::Shape(const DimVector& dim_vec) : dim_vec_(dim_vec), is_initialized_(true) {} -Shape::Shape(DimVector&& dim_vec) : dim_vec_(std::move(dim_vec)), is_initialized_(true) {} -Shape::Shape(const ShapeProto& shape_proto) : is_initialized_(true) { - dim_vec_.assign(shape_proto.dim().begin(), shape_proto.dim().end()); -} - -Shape& Shape::operator=(const Shape& shape) { - dim_vec_ = shape.dim_vec_; - is_initialized_ = shape.is_initialized_; - return *this; -} - -Shape& Shape::assign(const DimVector& dim_vec) { - dim_vec_ = dim_vec; - is_initialized_ = true; - return *this; -} +Shape::Shape(const DimVector& dim_vec) : DimVector(dim_vec), is_initialized_(true) {} +Shape::Shape(DimVector&& dim_vec) : DimVector(std::move(dim_vec)), is_initialized_(true) {} +Shape::Shape(const ShapeProto& shape_proto) + : DimVector(shape_proto.dim().begin(), shape_proto.dim().end()), is_initialized_(true) {} Shape& Shape::CheckNumAxesIdenticalAndAssign(const ShapeView& shape_view) { CHECK_EQ(NumAxes(), shape_view.NumAxes()); - std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), dim_vec_.data()); + std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), data()); return *this; } Shape& Shape::LeftOnesExtendedAssign(const ShapeView& shape_view) { CHECK_GE(NumAxes(), shape_view.NumAxes()); size_t 
left_ones_size = NumAxes() - shape_view.NumAxes(); - FOR_RANGE(int, i, 0, left_ones_size) { dim_vec_.at(i) = 1LL; } - std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), - dim_vec_.data() + left_ones_size); + FOR_RANGE(int, i, 0, left_ones_size) { (*this)[i] = 1LL; } + std::copy(shape_view.ptr(), shape_view.ptr() + shape_view.NumAxes(), data() + left_ones_size); return *this; } -bool Shape::operator==(const Shape& rhs) const { return dim_vec_ == rhs.dim_vec_; } - std::string Shape::ToString() const { std::stringstream ss; int32_t idx = 0; ss << "("; - for (int64_t dim : dim_vec_) { + for (int64_t dim : *this) { ss << dim; - if (++idx != dim_vec_.size() || dim_vec_.size() == 1) { ss << ","; } + if (++idx != size() || size() == 1) { ss << ","; } } ss << ")"; return ss.str(); @@ -111,21 +93,21 @@ std::string Shape::ToString() const { std::string Shape::DebugStr() const { return ToString(); } void Shape::ToProto(ShapeProto* ret) const { - *(ret->mutable_dim()) = PbRf(dim_vec_.begin(), dim_vec_.end()); + *(ret->mutable_dim()) = PbRf(begin(), end()); } int64_t Shape::At(int64_t index) const { CHECK_GE(index, 0); CHECK_LT(index, this->NumAxes()) << " Shape: " << DebugStr() << " visit index: " << index << " > num_axes: " << this->NumAxes(); - return dim_vec_.at(index); + return (*this)[index]; } void Shape::Set(int64_t index, int64_t val) { CHECK_GE(index, 0); CHECK_LT(index, this->NumAxes()) << " Shape: " << DebugStr() << " visit index: " << index << " > num_axes: " << this->NumAxes(); - dim_vec_.at(index) = val; + (*this)[index] = val; } int64_t Shape::Count(int64_t begin_axis, int64_t end_axis) const { @@ -206,9 +188,9 @@ Maybe Shape::Slice(int64_t start_dim, int64_t end_dim) const { int64_t ndims = this->NumAxes(); if (start_dim > ndims) { start_dim = ndims; } if (end_dim > ndims) { end_dim = ndims; } - DimVector dim_vec; - for (int64_t i = start_dim; i < end_dim && i < ndims; ++i) { dim_vec.emplace_back(this->At(i)); } - return 
std::make_shared(dim_vec); + std::shared_ptr shape = std::make_shared(); + for (int64_t i = start_dim; i < end_dim && i < ndims; ++i) { shape->emplace_back(this->At(i)); } + return shape; } } // namespace oneflow diff --git a/oneflow/core/common/shape.h b/oneflow/core/common/shape.h index 408a46c57ca..7a94ad85a6d 100644 --- a/oneflow/core/common/shape.h +++ b/oneflow/core/common/shape.h @@ -32,22 +32,37 @@ namespace cfg { class ShapeProto; } // namespace cfg -class Shape final { +class Shape final : public DimVector { public: // OF_DISALLOW_COPY_AND_MOVE(Shape); + using DimVector::DimVector; Shape() : is_initialized_(false) {} explicit Shape(const DimVector& dim_vec); explicit Shape(DimVector&& dim_vec); explicit Shape(const ShapeProto& shape_proto); - Shape(const std::initializer_list& dim_vec); + // explicit constructor from ShapeView + explicit Shape(ShapeView shape_view); ~Shape() = default; - Shape& operator=(const Shape& shape); - Shape& assign(const DimVector& dim_vec); + +#define OVERRIDE_ADD_DATA_FUNC(func) \ + template \ + void func(Args... 
args) { \ + DimVector::func(std::forward(args)...); \ + is_initialized_ = true; \ + } + + OVERRIDE_ADD_DATA_FUNC(assign) + OVERRIDE_ADD_DATA_FUNC(push_back) + OVERRIDE_ADD_DATA_FUNC(emplace_back) + OVERRIDE_ADD_DATA_FUNC(append) + OVERRIDE_ADD_DATA_FUNC(insert) + OVERRIDE_ADD_DATA_FUNC(resize) + +#undef OVERRIDE_ADD_DATA_FUNC + Shape& CheckNumAxesIdenticalAndAssign(const ShapeView& shape_view); Shape& LeftOnesExtendedAssign(const ShapeView& shape_view); - bool operator==(const Shape& rhs) const; - bool operator!=(const Shape& rhs) const { return !(*this == rhs); } std::string DebugStr() const; std::string ToString() const; @@ -58,16 +73,16 @@ class Shape final { // Getters and Setters bool is_initialized() const { return is_initialized_; } - const DimVector& dim_vec() const { return dim_vec_; } - DimVector& dim_vec() { return dim_vec_; } + const DimVector& dim_vec() const { return *this; } + DimVector& dim_vec() { return *this; } int64_t elem_cnt() const { - return std::accumulate(dim_vec_.begin(), dim_vec_.end(), int64_t(1), std::multiplies<>()); + return std::accumulate(begin(), end(), int64_t(1), std::multiplies<>()); } int64_t At(int64_t index) const; void Set(int64_t index, int64_t val); int64_t NumAxes() const { CHECK(is_initialized()); - return dim_vec_.size(); + return size(); } int64_t Count(int64_t begin_axis, int64_t end_axis) const; int64_t Count(int64_t begin_axis) const; @@ -82,13 +97,14 @@ class Shape final { Maybe Slice(int64_t start_dim, int64_t end_dim) const; - ShapeView ToShapeView() const { return ShapeView(dim_vec_.data(), dim_vec_.size()); } + ShapeView ToShapeView() const { return ShapeView(data(), size()); } - MutShapeView ToMutShapeView() { return MutShapeView(dim_vec_.data(), dim_vec_.size()); } + MutShapeView ToMutShapeView() { return MutShapeView(data(), size()); } private: - DimVector dim_vec_; - bool is_initialized_; + // Set default value here because some constructors are inherited from DimVector + // TODO(daquexian): remove this 
field and make it initializied by construction + bool is_initialized_ = true; }; int64_t ShiftNegativeAxis(int64_t axis, const int64_t num_axes); @@ -99,7 +115,7 @@ Shape ZeroDimCompatiableShape(const Shape& shape); Shape CreateReducedShapeOrOnesShape(const ShapeView& shape, const AxisVector& axis_vec); template void Shape::SerializeWithTextFormat(StreamT& out_stream) const { - for (int64_t dim : dim_vec_) { out_stream << std::to_string(dim) << ' '; } + for (int64_t dim : *this) { out_stream << std::to_string(dim) << ' '; } } std::ostream& operator<<(std::ostream& out, const Shape& shape); diff --git a/oneflow/core/common/shape_vec.h b/oneflow/core/common/shape_vec.h index c97870a4832..18a34ed7741 100644 --- a/oneflow/core/common/shape_vec.h +++ b/oneflow/core/common/shape_vec.h @@ -16,24 +16,15 @@ limitations under the License. #ifndef ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ #define ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" namespace oneflow { -//#define DISABLE_FIXED_SHAPE_VEC #define SHAPE_MAX_AXIS_SIZE 20 -#if defined(DISABLE_FIXED_SHAPE_VEC) +typedef small_vector DimVector; +typedef small_vector AxisVector; -typedef std::vector DimVector; -typedef std::vector AxisVector; - -#else - -typedef fixed_vector DimVector; -typedef fixed_vector AxisVector; - -#endif } // namespace oneflow #endif // ONEFLOW_CORE_COMMON_SHAPE_VEC_H_ diff --git a/oneflow/core/common/shape_view.cpp b/oneflow/core/common/shape_view.cpp index cd0bb3d1370..648034665fe 100644 --- a/oneflow/core/common/shape_view.cpp +++ b/oneflow/core/common/shape_view.cpp @@ -77,7 +77,7 @@ template void ShapeViewBase::ToShape(Shape* shape) const { DimVector dim_vec; this->ToDimVector(&dim_vec); - shape->assign(dim_vec); + *shape = Shape(dim_vec); } template class ShapeViewBase; diff --git a/oneflow/core/common/small_vector.h b/oneflow/core/common/small_vector.h new file mode 100644 index 00000000000..6aee5359f2b --- /dev/null +++ 
b/oneflow/core/common/small_vector.h @@ -0,0 +1,53 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ +#define ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ + +#include "llvm/ADT/SmallVector.h" + +namespace oneflow { + +template +class small_vector : public llvm::SmallVector { + using Base = llvm::SmallVector; + + public: + // https://stackoverflow.com/questions/27954940/a-using-statement-compiles-with-g-fails-compilation-with-clang + using Base::Base; + + typename Base::reference at(typename Base::size_type idx) { + CHECK_LT(idx, Base::size()); + return (*this)[idx]; + } + typename Base::const_reference at(typename Base::size_type idx) const { + CHECK_LT(idx, Base::size()); + return (*this)[idx]; + } + typename Base::const_iterator cbegin() const { + return (typename Base::const_iterator)this->BeginX; + } + typename Base::const_iterator cend() const { + return (typename Base::const_iterator)(this->BeginX) + Base::size(); + } + typename Base::const_iterator cbegin() { return (typename Base::const_iterator)this->BeginX; } + typename Base::const_iterator cend() { + return (typename Base::const_iterator)(this->BeginX) + Base::size(); + } +}; + +} // namespace oneflow + +#endif // ONEFLOW_CORE_COMMON_SMALL_VECTOR_H_ diff --git a/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp b/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp index d119c3eeaf3..a8be6b054ed 100644 
--- a/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp +++ b/oneflow/core/ep/cpu/primitive/broadcast_elementwise_binary.cpp @@ -72,9 +72,9 @@ class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, size_t num_src1_dims, const int64_t* src1_dims, const void* src1, void* dst) override { - DimVector src0_dim_vec; - DimVector src1_dim_vec; - DimVector dst_dim_vec; + Shape src0_shape; + Shape src1_shape; + Shape dst_shape; size_t num_dims = 0; int64_t simplified_src0_dims[kMaxNumDims]; int64_t simplified_src1_dims[kMaxNumDims]; @@ -85,15 +85,13 @@ class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { CheckInplace(num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1, simplified_dst_dims, dst); for (int64_t i = 0; i < num_dims; ++i) { - src0_dim_vec.push_back(simplified_src0_dims[i]); - src1_dim_vec.push_back(simplified_src1_dims[i]); - dst_dim_vec.push_back(simplified_dst_dims[i]); + src0_shape.push_back(simplified_src0_dims[i]); + src1_shape.push_back(simplified_src1_dims[i]); + dst_shape.push_back(simplified_dst_dims[i]); } - binary_func( - stream, XpuVarNdarray(Shape(dst_dim_vec), reinterpret_cast(dst), num_dims), - XpuVarNdarray(Shape(src0_dim_vec), reinterpret_cast(src0), num_dims), - XpuVarNdarray(Shape(src1_dim_vec), reinterpret_cast(src1), - num_dims)); + binary_func(stream, XpuVarNdarray(dst_shape, reinterpret_cast(dst), num_dims), + XpuVarNdarray(src0_shape, reinterpret_cast(src0), num_dims), + XpuVarNdarray(src1_shape, reinterpret_cast(src1), num_dims)); } }; diff --git a/oneflow/core/framework/tensor_methods.cpp b/oneflow/core/framework/tensor_methods.cpp index 7ed41a652f8..6f6cf271660 100644 --- a/oneflow/core/framework/tensor_methods.cpp +++ b/oneflow/core/framework/tensor_methods.cpp @@ -188,7 +188,7 @@ Maybe Unsqueeze(const std::shared_ptr& input, const int32_t& exp cnt++; } 
target_dim_vec[expand_dim] = 1; - target_stride_vec[expand_dim] = strides->At(expand_dim); + target_stride_vec[expand_dim] = expand_dim < ndim ? strides->At(expand_dim) : 1; } int64_t storage_offset = JUST(JUST(input->AsMirroredTensor())->storage_offset()); diff --git a/oneflow/core/job/eager_nccl_comm_manager.cpp b/oneflow/core/job/eager_nccl_comm_manager.cpp index d8b77cdbb72..959a7837010 100644 --- a/oneflow/core/job/eager_nccl_comm_manager.cpp +++ b/oneflow/core/job/eager_nccl_comm_manager.cpp @@ -71,6 +71,9 @@ void CreateNcclComm(ncclComm_t* comm, const int dev, const std::string& key, << ", nccl_unique_id = " << NcclUniqueId2String(nccl_unique_id) << ", rank = " << rank << ", key = {" << key << "}\n"; OF_NCCL_CHECK(ncclCommInitRank(comm, device_vec.size(), nccl_unique_id, rank)); + VLOG(2) << " EagerNcclCommMgr::ncclCommInitRank succeed device_vec.size() = " << device_vec.size() + << ", nccl_unique_id = " << NcclUniqueId2String(nccl_unique_id) << ", rank = " << rank + << ", key = {" << key << "}\n"; } } // namespace diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index 5afc24192da..8ae659fd541 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -997,13 +997,19 @@ Maybe LazyJobBuildAndInferCtx::Complete() { } }; int32_t pass_cnt = 0; + const int64_t prev_v = FLAGS_v; auto DoPass = [&](const std::string& pass_name, int32_t cnt = 0) -> Maybe { + VLOG(1) << job_name << " is compiling with pass" + << " pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + << (cnt > 0 ? std::to_string(cnt) : ""); if (unlikely(NeedLogJob(pass_name))) { std::string cnt_str = cnt > 0 ? std::to_string(cnt) : ""; LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-before"); + FLAGS_v = 3; } JUST(JobPass4Name(pass_name)(mut_job(), &job_pass_ctx)); if (unlikely(NeedLogJob(pass_name))) { + FLAGS_v = prev_v; std::string cnt_str = cnt > 0 ? 
std::to_string(cnt) : ""; LogJob("pass_cnt_" + std::to_string(pass_cnt) + "-" + pass_name + cnt_str + "-after"); } diff --git a/oneflow/core/job/job_builder.cpp b/oneflow/core/job/job_builder.cpp index a7f81384376..b13bd8a67fd 100644 --- a/oneflow/core/job/job_builder.cpp +++ b/oneflow/core/job/job_builder.cpp @@ -18,6 +18,7 @@ limitations under the License. #include "oneflow/core/common/util.h" #include "oneflow/core/common/container_util.h" #include "oneflow/core/job/job.pb.h" +#include "oneflow/core/job/sbp_parallel.pb.h" #include "oneflow/core/operator/operator.h" namespace oneflow { diff --git a/oneflow/core/job/job_conf.proto b/oneflow/core/job/job_conf.proto index 69aa7ad29f0..03638feec30 100644 --- a/oneflow/core/job/job_conf.proto +++ b/oneflow/core/job/job_conf.proto @@ -211,6 +211,7 @@ message JobConfigProto { optional bool enable_gradients_stats_aggregation = 106 [default = true]; optional string optimizer_placement_optimization_mode = 107; optional int64 optimizer_placement_optimization_threshold = 108 [default = 1024]; + optional int64 optimizer_placement_optimization_shard_restore_level = 110 [default = 2]; optional QatConfig qat_config = 109; diff --git a/oneflow/core/job/job_ir.cpp b/oneflow/core/job/job_ir.cpp index 792735a0354..f5552b92514 100644 --- a/oneflow/core/job/job_ir.cpp +++ b/oneflow/core/job/job_ir.cpp @@ -19,6 +19,10 @@ namespace oneflow { #ifndef WITH_MLIR +Maybe ConvertJobToTosaIR(Job* job) { + UNIMPLEMENTED_THEN_RETURN() << "ConvertJobToTosaIR is only supported WITH_MLIR"; +} + Maybe SaveJobToIR(Job* job, const std::string& path) { UNIMPLEMENTED_THEN_RETURN() << "SaveJobToIR is only supported WITH_MLIR"; } diff --git a/oneflow/core/job/job_ir.h b/oneflow/core/job/job_ir.h index c57d0eebeb8..7dbd8da0c31 100644 --- a/oneflow/core/job/job_ir.h +++ b/oneflow/core/job/job_ir.h @@ -21,6 +21,7 @@ limitations under the License. 
namespace oneflow { +Maybe ConvertJobToTosaIR(Job* job); Maybe SaveJobToIR(Job* job, const std::string& path); Maybe LoadJobFromIR(Job* job, const std::string& path); diff --git a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp index 15ca99eab73..d51c171df19 100644 --- a/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp +++ b/oneflow/core/job_rewriter/auto_mixed_precision_lists.cpp @@ -90,6 +90,7 @@ const AMPList& AutoMixedPrecisionLists::ClearList() { "identity", "flatten", "squeeze", + "embedding", "expand_dims", "cast_to_static_shape", "parallel_cast", diff --git a/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp b/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp index 1ca857fd11f..7aaf2e75426 100644 --- a/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp +++ b/oneflow/core/job_rewriter/optimizer_placement_optimization_pass.cpp @@ -13,10 +13,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include "oneflow/core/common/util.h" +#include "oneflow/core/framework/nd_sbp.h" +#include "oneflow/core/framework/user_op_conf.h" +#include "oneflow/core/job/nd_sbp_util.h" +#include "oneflow/core/job/sbp_parallel.h" +#include "oneflow/core/job/sbp_parallel.pb.h" #include "oneflow/core/job_rewriter/job_pass.h" #include "oneflow/core/graph/op_graph.h" #include "oneflow/core/job/job_desc.h" +#include "oneflow/core/operator/op_conf.pb.h" +#include "oneflow/core/operator/operator.h" namespace oneflow { @@ -31,7 +40,7 @@ int64_t GetSoleOutBlobSize(const OpNode* node) { class DataParallelNodeSequence final { public: DataParallelNodeSequence(std::vector nodes, int64_t order) - : nodes_(std::move(nodes)), order_(order) { + : nodes_(std::move(nodes)), order_(order), len_(nodes_.size()) { const OpNode* var_node = nodes_.front(); CHECK(var_node->op().op_conf().has_variable_conf()); model_size_ = GetSoleOutBlobSize(var_node); @@ -50,13 +59,23 @@ class DataParallelNodeSequence final { int64_t model_size() const { return model_size_; } + int64_t len() const { return len_; } + + void resize(const int64_t size) { + CHECK(size <= len_); + CHECK(size > 1); + nodes_.resize(size); + len_ = nodes().size(); + } + private: std::vector nodes_; int64_t order_; int64_t model_size_; + int64_t len_; }; -using SequencePtr = std::shared_ptr; +using SequencePtr = std::shared_ptr; ParallelConf NonDistributedParallelConf4ParallelId(const ParallelDesc& pd, const int64_t parallel_id) { @@ -76,7 +95,6 @@ Maybe GetDataParallelVariableAndNaiveSuccNode( // Find sequence like: vairable -> cast_fp32_to_fp16 if (!start->op().op_conf().has_variable_conf()) { return Maybe::Ok(); } const ParallelDesc& pd = start->parallel_desc(); - if (pd.device_type() != DeviceType::kCUDA) { return Maybe::Ok(); } if (pd.parallel_num() == 1) { return Maybe::Ok(); } const OpNode* cur_node = start; while (cur_node != nullptr) { @@ -85,12 +103,21 @@ Maybe GetDataParallelVariableAndNaiveSuccNode( if 
(cur_node->in_edges().size() > 1) { break; } if (cur_node->op().input_bns().size() != 1) { break; } const std::string& sole_ibn = cur_node->op().SoleIbn(); - if (!cur_node->SbpParallel4BnInOp(sole_ibn).has_broadcast_parallel()) { break; } + const NdSbp& ibn_nd_sbp = cur_node->NdSbp4BnInOp(sole_ibn); + bool has_broadcast = false; + FOR_RANGE(int, i, 0, ibn_nd_sbp.sbp_parallel_size()) { + if (ibn_nd_sbp.sbp_parallel(i).has_broadcast_parallel()) { has_broadcast = true; }; + } + if (!has_broadcast) { break; } } - if (!IsAllowed(cur_node)) { break; } if (cur_node->op().output_bns().size() != 1) { break; } const std::string& sole_obn = cur_node->op().SoleObn(); - if (!cur_node->SbpParallel4BnInOp(sole_obn).has_broadcast_parallel()) { break; } + const NdSbp& obn_nd_sbp = cur_node->NdSbp4BnInOp(sole_obn); + bool has_broadcast = false; + FOR_RANGE(int, i, 0, obn_nd_sbp.sbp_parallel_size()) { + if (obn_nd_sbp.sbp_parallel(i).has_broadcast_parallel()) { has_broadcast = true; }; + } + if (!has_broadcast) { break; } out->emplace_back(cur_node); if (cur_node->out_edges().size() == 1) { cur_node = cur_node->SoleOutEdge()->dst_node(); @@ -123,6 +150,79 @@ void SetBroadcastParallel4Consumers(JobBuilder* builder, const SequencePtr& sequ }); } +void SetNdSbp4OpNodeIbn(JobBuilder* builder, const OpNode* node, const std::string& ibn, + const NdSbp& nd_sbp) { + OpBlobArg op_blob_arg; + op_blob_arg.set_op_name(node->op().op_name()); + op_blob_arg.set_bn_in_op(ibn); + builder->SetNdSbp4Oba(op_blob_arg, nd_sbp); +} + +void SetNdSbp4Consumers(JobBuilder* builder, const SequencePtr& sequence, const NdSbp& nd_sbp) { + const OpNode* node = sequence->GetLastNode(); + const LogicalBlobId& lbi = node->op().BnInOp2Lbi(node->op().SoleObn()); + const int64_t shard_restore_level = + builder->job().job_conf().optimizer_placement_optimization_shard_restore_level(); + // If shard_restore_level == 0, no limit on consumer + if (shard_restore_level == 1) { + // Input lbn for parallel cast op + std::string 
parallel_cast_input_lbn = GenLogicalBlobName(lbi); + // Add indentity to enable mem reuse of boxing op when there is no op between var op and boxing. + if (sequence->len() == 1) { + VLOG(3) << "ZeRO find a data-parallel sequence only has one variable " + << sequence->GetVariableNode()->op().op_name(); + const auto var_identity_op = + user_op::UserOpConfWrapperBuilder("System-ZeRO-Identity-" + node->op().op_name() + "-" + + NewUniqueId()) + .Op("identity") + .Input("in", GenLogicalBlobName(lbi)) + .Output("out") + .ScopeSymbolId(node->op().op_conf().scope_symbol_id()) + .Build(); + builder->AddOps(node->parallel_desc().parallel_conf(), {var_identity_op.op_conf()}); + parallel_cast_input_lbn = var_identity_op.output("out", 0); + } + // Add parallel cast op to make soft limt on consumer to consume weight with Broadcast SBP. + const auto parallel_cast_op = + user_op::UserOpConfWrapperBuilder("System-ZeRO-ParallelCast-" + node->op().op_name() + "-" + + NewUniqueId()) + .Op("hierarchical_parallel_cast") + .Input("in", parallel_cast_input_lbn) + .Output("out") + .Attr>("nd_sbp", NdSbpToStringList(nd_sbp)) + .Attr("grad_mode", "identity") // don't do ndsbp cast at backward + .Attr>("grad_nd_sbp", std::vector()) + .ScopeSymbolId(node->op().op_conf().scope_symbol_id()) + .Build(); + builder->AddOps(node->parallel_desc().parallel_conf(), {parallel_cast_op.op_conf()}); + + // Make consumers to consume parallel cast op + auto out_lbn = parallel_cast_op.output("out", 0); + node->ForEachNodeOnOutEdge([&](const OpNode* out_node) { + for (const std::string& ibn : out_node->op().input_bns()) { + if (out_node->op().BnInOp2Lbi(ibn) == lbi) { + if (!CHECK_JUST(builder->IsInMutOpTransaction(out_node->op().op_name()))) { + CHECK_JUST(builder->MutOpTransactionMut(out_node->op().op_conf())); + } + OperatorConf& mut_consumer_op = + CHECK_JUST(builder->MutOpTransactionGet(out_node->op().op_name())); + const auto& old_lbn = ReplaceInputLbnInOpCustomizedConf(&mut_consumer_op, ibn, out_lbn); + 
CHECK_EQ(old_lbn, GenLogicalBlobName(lbi)); + } + } + }); + } else if (shard_restore_level == 2) { + // Hard limt consumer to consume weight as Broadcast. + node->ForEachNodeOnOutEdge([&](const OpNode* out_node) { + for (const std::string& ibn : out_node->op().input_bns()) { + if (out_node->op().BnInOp2Lbi(ibn) == lbi) { + SetNdSbp4OpNodeIbn(builder, out_node, ibn, nd_sbp); + } + } + }); + } +} + std::function MakeGetterOpNode2TopoOrder(const OpGraph& op_graph) { HashMap op_node2topo_order; int64_t node_cnt = 0; @@ -152,7 +252,7 @@ void ForEachDataParallelNodeSequence(const OpGraph& op_graph, CHECK_JUST(GetDataParallelVariableAndNaiveSuccNode(node, IsAllowed, &nodes)); if (nodes.empty()) { return; } const int64_t order = GetMinConsumerOrder(op_graph, nodes.back(), OpNode2Order); - Handler(std::make_shared(std::move(nodes), order)); + Handler(std::make_shared(std::move(nodes), order)); }); } @@ -188,6 +288,24 @@ bool IsS0Parallel(const SbpSignature& signature, const std::string& bn) { return IsS0Parallel(signature.bn_in_op2sbp_parallel().at(bn)); } +bool IsNdSbpMatch(const NdSbpSignature& signature, const std::string& bn, const NdSbp& nd_sbp) { + return signature.bn_in_op2nd_sbp().at(bn) == nd_sbp; +} + +bool IsNdSbpSupported4Op(const OpNode* node, const NdSbp& nd_sbp) { + if (node->op().input_bns().size() != 1 || node->op().output_bns().size() != 1) { return false; } + std::vector list; + auto LogicalBlobDesc4Ibn = [&](const std::string& bn) -> Maybe { + return Maybe(node->LogicalBlobDesc4Lbi(node->op().BnInOp2Lbi(bn))); + }; + CHECK_JUST(node->op().GetNdSbpSignatureList(LogicalBlobDesc4Ibn, node->parallel_desc(), &list)); + const auto IsInAndOutMatch = [&](const NdSbpSignature& signature) { + return IsNdSbpMatch(signature, node->op().SoleIbn(), nd_sbp) + && IsNdSbpMatch(signature, node->op().SoleObn(), nd_sbp); + }; + return std::any_of(list.cbegin(), list.cend(), IsInAndOutMatch); +} + bool IsS0SignatureSupported(const OpNode* node) { if 
(node->op().input_bns().size() != 1 || node->op().output_bns().size() != 1) { return false; } SbpSignatureList list; @@ -222,42 +340,141 @@ void ForEachModelSizeBalancedPartition( } } -Maybe RewriteDistributedSplit(const OpGraph& op_graph, JobBuilder* builder) { - const int64_t threshold = builder->job().job_conf().optimizer_placement_optimization_threshold(); - const auto IsAllowed = [threshold](const OpNode* n) -> bool { - if (n->op().op_conf().has_variable_conf()) { - const Shape shape(n->op().op_conf().variable_conf().shape()); - const int64_t parallel_num = n->parallel_desc().parallel_num(); - // Parameter needs to be able to evenly splited and one slice size >= threshold - return shape.At(0) % parallel_num == 0 && shape.elem_cnt() >= threshold * parallel_num; +namespace { +bool IsSplitValid(const Shape& shape, const NdSbp& nd_sbp, const Shape& hierachy, + int64_t min_size) { + if (shape.NumAxes() < 1 || shape.elem_cnt() < 1) { return false; } + CHECK_EQ(nd_sbp.sbp_parallel_size(), hierachy.NumAxes()); + Shape cur_shape = shape; + if (cur_shape.elem_cnt() < min_size) { return false; } + FOR_RANGE(int64_t, i, 0, hierachy.NumAxes()) { + const auto& sbp = nd_sbp.sbp_parallel(i); + if (sbp.has_split_parallel()) { + const int64_t dim = sbp.split_parallel().axis(); + if (dim >= cur_shape.NumAxes()) { return false; } + // Evenly split. + if (cur_shape.At(dim) % hierachy.At(i) != 0) { return false; } + cur_shape.Set(dim, cur_shape.At(dim) / hierachy.At(i)); + // Larger then min size. 
+ if (cur_shape.elem_cnt() < min_size) { return false; } + } + } + return true; +} + +void GenerateSplitSignature(const NdSbp& var_nd_sbp, const OperatorConf& new_var_op_conf, + std::string& new_split_signature, int64_t& split_dim) { + if (new_var_op_conf.variable_conf().nd_sbp_size() > 0 && NdSbpIsAllBroadcast(var_nd_sbp)) { + // split last dim + split_dim = new_var_op_conf.variable_conf().nd_sbp_size() - 1; + // All B, B -> S0 + new_split_signature = "S(0)"; + } else { + // ND sbp, (*, B, S, *) -> (*, S, S, *) + // ND sbp, (*, S, B, *) -> (*, S, S, *) + FOR_RANGE(int64_t, j, 0, new_var_op_conf.variable_conf().nd_sbp_size()) { + if (new_var_op_conf.variable_conf().nd_sbp(j) == "B") { + std::vector adjacent_dim{j - 1, j + 1}; + for (auto const& dim_to_try : adjacent_dim) { + if (dim_to_try >= 0 && dim_to_try < new_var_op_conf.variable_conf().nd_sbp_size()) { + SbpParallel sbp; + if (ParseSbpParallelFromString(new_var_op_conf.variable_conf().nd_sbp(dim_to_try), &sbp) + && sbp.has_split_parallel()) { + new_split_signature = new_var_op_conf.variable_conf().nd_sbp(dim_to_try); + split_dim = j; + } + } + if (new_split_signature != "") break; + } + } + // Only split one more dim. + if (new_split_signature != "") break; + } + } +} +void ShardSequence(JobBuilder* builder, const int64_t threshold, const ParallelDesc& pd, + std::vector&& sorted_sequences) { + // For all sorted sequnence, set the variable op in the sequence to S + // and add ctrl edge to control the exectuion order between variable ops. + // A sequence is a variable op and its cast(fp32 to fp16) op. This is because the forward pass + // consume the fp16 variable and the optimizer consume the fp32 variable. 
+ std::string prev_allowed_op_name = ""; + for (int64_t i = 0; i < sorted_sequences.size(); ++i) { + const OpNode* var_node = sorted_sequences.at(i)->GetVariableNode(); + OperatorConf new_var_op_conf = var_node->op().op_conf(); + const std::string& sole_obn = var_node->op().SoleObn(); + const NdSbp& var_nd_sbp = var_node->NdSbp4BnInOp(sole_obn); + const Shape& logical_shape = Shape(new_var_op_conf.variable_conf().shape()); + + std::string new_split_signature = ""; + int64_t split_dim = 0; + GenerateSplitSignature(var_nd_sbp, new_var_op_conf, new_split_signature, split_dim); + if (new_split_signature != "") { + *new_var_op_conf.mutable_variable_conf()->mutable_nd_sbp(split_dim) = new_split_signature; } else { - return IsS0SignatureSupported(n); + continue; + } + + bool split_is_allowed = true; + if (split_is_allowed) { + NdSbp new_nd_sbp; + std::vector nd_sbp_str_vec; + for (const auto& sbp_str : new_var_op_conf.variable_conf().nd_sbp()) { + nd_sbp_str_vec.push_back(sbp_str); + } + ParseNdSbpFromStringList(nd_sbp_str_vec, &new_nd_sbp); + // check allowed by min shard size and evenly split + if (split_is_allowed) { + split_is_allowed = IsSplitValid(logical_shape, new_nd_sbp, *pd.hierarchy(), threshold); + } + if (split_is_allowed) { + // resize sequence by new nd sbp limit + auto& cur_seq = sorted_sequences.at(i); + int64_t max_len = 1; + if (cur_seq->len() > 1) { + FOR_RANGE(int64_t, node_idx, 1, cur_seq->len()) { + if (IsNdSbpSupported4Op(cur_seq->nodes().at(node_idx), new_nd_sbp)) { + ++max_len; + } else { + break; + } + } + } + if (max_len < cur_seq->len()) { cur_seq->resize(max_len); } + } } + if (!split_is_allowed) { + VLOG(3) << var_node->op().op_name() << " failed to change form B to S " + << " with op conf " << new_var_op_conf.variable_conf().DebugString(); + continue; + } + if (i != 0) { new_var_op_conf.add_ctrl_in_op_name(prev_allowed_op_name); } + builder->MutOpsOnlyOnce({new_var_op_conf}); + // Set consumers to consum this variable op's cast op's output 
as Broadcast. + if (new_split_signature != "") { + SetNdSbp4Consumers(builder, sorted_sequences.at(i), var_nd_sbp); + } + prev_allowed_op_name = var_node->op().op_name(); + VLOG(3) << var_node->op().op_name() << " succeed to change form B to " << new_split_signature + << " on ranks dim " << split_dim << " with op conf " + << new_var_op_conf.variable_conf().DebugString(); + } +} +} // namespace + +Maybe RewriteDistributedSplit(const OpGraph& op_graph, JobBuilder* builder) { + const int64_t threshold = builder->job().job_conf().optimizer_placement_optimization_threshold(); + const auto IsAllowed = [](const OpNode* n) -> bool { + // No need to limit here. + return true; }; const auto PlacementSequencesAsSplitParallel = [&](const ParallelDesc& pd, std::vector&& sorted_sequences) { - // For all sorted sequnence, set the variable op in the sequence to S(0) - // and add ctrl edge to control the exectuion order between variable ops. - // A sequence is a variable op and its cast(fp32 to fp16) op. This is because the forward pass - // consume the fp16 variable and the optimizer consume the fp32 variable. - for (int64_t i = 0; i < sorted_sequences.size(); ++i) { - const OpNode* var_node = sorted_sequences.at(i)->GetVariableNode(); - OperatorConf new_var_op_conf = var_node->op().op_conf(); - CHECK_EQ(pd.hierarchy()->NumAxes(), 1); - new_var_op_conf.mutable_variable_conf()->clear_nd_sbp(); - *new_var_op_conf.mutable_variable_conf()->add_nd_sbp() = "S(0)"; - if (i != 0) { - const std::string& prev_op_name = - sorted_sequences.at(i - 1)->GetVariableNode()->op().op_name(); - new_var_op_conf.add_ctrl_in_op_name(prev_op_name); - } - builder->MutOpsOnlyOnce({new_var_op_conf}); - // Set consumers to consum this variable op's cast op's output as Broadcast. 
- SetBroadcastParallel4Consumers(builder, sorted_sequences.at(i)); - } + ShardSequence(builder, threshold, pd, std::forward>(sorted_sequences)); }; ForEachParallelSortedNodeSequence(op_graph, IsAllowed, SequenceCompSortedByOrderAsc, PlacementSequencesAsSplitParallel); + JUST(builder->MutOpTransactionCommit()); return Maybe::Ok(); } @@ -313,7 +530,8 @@ class OptimizerPlacementOptimizationPass final : public JobPass { Maybe Apply(Job* job, JobPassCtx* ctx) const override { if (!(ctx->job_desc().IsTrain() - && ctx->job_desc().job_conf().has_optimizer_placement_optimization_mode())) { + && ctx->job_desc().job_conf().has_optimizer_placement_optimization_mode() + && ctx->job_desc().job_conf().optimizer_placement_optimization_mode() != "none")) { return Maybe::Ok(); } const std::string& mode = ctx->job_desc().job_conf().optimizer_placement_optimization_mode(); diff --git a/oneflow/extension/python/numpy_internal.h b/oneflow/extension/python/numpy_internal.h index fd3e9594034..84590a38990 100644 --- a/oneflow/extension/python/numpy_internal.h +++ b/oneflow/extension/python/numpy_internal.h @@ -22,7 +22,7 @@ limitations under the License. 
// ************************ #include "oneflow/core/common/data_type.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/common/shape_vec.h" // PyArrayObject cannot be forward declared, or a compile error will occur diff --git a/oneflow/ir/install-llvm.cmake b/oneflow/ir/install-llvm.cmake index e7c09ba1aae..1b7226dccab 100644 --- a/oneflow/ir/install-llvm.cmake +++ b/oneflow/ir/install-llvm.cmake @@ -49,6 +49,8 @@ if(NOT llvm_monorepo_POPULATED) endif() endif() +set(LLVM_INCLUDE_DIRS ${llvm_monorepo_SOURCE_DIR}/llvm/include;${llvm_monorepo_BINARY_DIR}/include) + if(WITH_MLIR) set(LLVM_DIR ${LLVM_INSTALL_DIR}/lib/cmake/llvm) set(MLIR_DIR ${LLVM_INSTALL_DIR}/lib/cmake/mlir) diff --git a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp index 488198828ac..80836da5786 100644 --- a/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp +++ b/oneflow/ir/lib/OneFlow/Conversion/OneFlowToTosa.cpp @@ -14,21 +14,26 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "OneFlow/OneFlowOps.h" +#include #include #include #include "OneFlow/OneFlowDialect.h" #include "OneFlow/Passes.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/Passes.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Pass/Pass.h" @@ -36,11 +41,46 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/Passes.h" +#include "oneflow/core/framework/op_expr_grad_function.h" +#include "oneflow/core/framework/variable_tensor_mgr.h" + +#include namespace mlir { namespace oneflow { +Value CreateTranspose(Location& loc, ConversionPatternRewriter& rewriter, Value input, + ArrayRef perms) { + int perms_size = perms.size(); + auto transpose_perms = rewriter.create( + loc, RankedTensorType::get({perms_size}, rewriter.getI32Type()), + rewriter.getI32TensorAttr(perms)); + const auto shape_type = input.getType().cast(); + std::vector ranked_type; + for (const auto& index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return rewriter.create( + loc, RankedTensorType::get(ranked_type, shape_type.getElementType()), input, transpose_perms); +}; + +Value CreateBNOp(Location loc, ConversionPatternRewriter& rewriter, Value output, Value x, + Value mean, Value variance, Value epsilon, Value gamma, Value beta) { + const auto output_type = output.getType(); + // sub_op = sub(input, mean) + auto 
sub_op0 = rewriter.create(loc, output_type, x, mean); + // add_op0 = add(var, epsilon) + auto add_op0 = rewriter.create(loc, variance.getType(), variance, epsilon); + // rsqrt_op = rsqrt(add_op0) + auto rsqrt_op = rewriter.create(loc, variance.getType(), add_op0); + // op4 = mul(sub_op, rsqrt_op) + auto mul_op0 = rewriter.create(loc, output_type, sub_op0, rsqrt_op, 0); + // op5 = mul(mul_op0, gamma) + auto mul_op1 = rewriter.create(loc, output_type, mul_op0, gamma, 0); + // op6 = add(mul_op1, beta) + auto batch_norm = rewriter.create(loc, output_type, mul_op1, beta); + return batch_norm; +}; + struct ScalarMulByTensorOpLowering final : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -72,18 +112,421 @@ struct ScalarMulByTensorOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Job op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto func = + rewriter.create(op.getLoc(), op.getName(), op.getFunctionType()); + rewriter.inlineRegionBefore(op.getRegion(), func.getBody(), func.end()); + rewriter.eraseOp(op); + return success(); + } +}; + +struct ReturnOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(ReturnOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + rewriter.replaceOpWithNewOp(op, + /* operands */ op.operands()); + return success(); + } +}; + +struct InputOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(InputOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more choices to passing data between tosa and oneflow + const auto newValues = op.input(); + const auto is_block_arg = newValues.dyn_cast() != nullptr; + if (!is_block_arg) op->emitError("input is not 
block arg"); + rewriter.replaceOp(op, newValues); + return success(); + } +}; + +struct OutputOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(OutputOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more choices to passing data between tosa and oneflow + const auto newValues = op.input(); + rewriter.replaceOp(op, newValues); + return success(); + } +}; + +struct VariableOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(VariableOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto mgr = ::oneflow::Global<::oneflow::VariableTensorMgr>::Get(); + // decide whether call by python or not + if (!mgr) op->emitError("oneflow variable op doesn't support pure mlir file conversion"); + + const auto tensor = mgr->Get(op.op_name().str()); + const auto value = support::TensorToDenseElementsAttr(tensor, rewriter.getContext()); + const auto output = op.output().getType(); + + rewriter.replaceOpWithNewOp(op, output, value); + return success(); + } +}; + struct CastOpLowering final : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(CastOp op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const override { - rewriter.replaceOpWithNewOp(op, - /* output */ op.out().getType(), - /* input */ op.in()); + auto output = op.out().getType(); + auto input = op.in(); + rewriter.replaceOpWithNewOp(op, output, input); + return success(); + } +}; + +struct ReluOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(ReluOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto floatMax = std::numeric_limits::max(); + const auto intMax = 
std::numeric_limits::max(); + + const auto output = op.y().getType(); + auto input = op.x(); + auto max_int = static_cast(intMax); + auto max_fp = static_cast<::llvm::APFloat>(floatMax); + + rewriter.replaceOpWithNewOp(op, output, input, max_int, max_fp); + return success(); + } +}; + +struct BroadcastAddOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(BroadcastAddOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto output = op.z().getType(); + auto input1 = op.x(); + auto input2 = op.y(); + + rewriter.replaceOpWithNewOp(op, output, input1, input2); + return success(); + } +}; + +struct Add2OpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Add2Op op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto output = op.out().getType(); + auto input1 = op.in0(); + auto input2 = op.in1(); + + rewriter.replaceOpWithNewOp(op, output, input1, input2); + return success(); + } +}; + +struct AvgPool2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(AvgPool2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + + auto stride_pairs = get_pair_int64_from_array(op.stride()); + auto pad_pairs = get_pair_int64_from_array(op.padding()); + auto kernel_pairs = 
get_pair_int64_from_array(op.kernel_size()); + + auto loc = op.getLoc(); + auto perms = {0, 2, 3, 1}; + + const auto kernel = rewriter.getI64ArrayAttr({kernel_pairs.first, kernel_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto pad = rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + + auto input = CreateTranspose(loc, rewriter, op.x(), perms); + auto output = reshape_type(op.y().getType().cast(), perms); + + auto avg_pool2d = rewriter.create(loc, output, input, kernel, stride, pad); + + auto out = CreateTranspose(loc, rewriter, avg_pool2d, {0, 3, 1, 2}); + rewriter.replaceOp(op, {out}); + return success(); + } +}; + +struct MaxPool2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(MaxPool2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + // TODO: support return indice + if (op.return_indices()) op->emitError("not support return indices now"); + auto stride_pairs = get_pair_int64_from_array(op.stride()); + auto kernel_pairs = get_pair_int64_from_array(op.kernel_size()); + auto pad_pairs = get_pair_int64_from_array(op.padding()); + + auto loc = op.getLoc(); + auto perms = {0, 2, 3, 1}; + + const auto kernel = rewriter.getI64ArrayAttr({kernel_pairs.first, kernel_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto pad = 
rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + + auto input = CreateTranspose(loc, rewriter, op.x(), perms); + auto output = reshape_type(op.y().getType().cast(), perms); + + auto max_pool2d = rewriter.create(loc, output, input, kernel, stride, pad); + + auto y = CreateTranspose(loc, rewriter, max_pool2d, {0, 3, 1, 2}); + + auto indice_output = op.indice().getType(); + auto value = DenseElementsAttr::get(indice_output, rewriter.getZeroAttr(rewriter.getI64Type())); + + auto indice = rewriter.create(loc, indice_output, value); + rewriter.replaceOp(op, {y, indice}); + return success(); + } +}; + +struct FlattenOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(FlattenOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + const auto start_dim = op.start_dim(); + const auto end_dim = op.end_dim(); + const auto in_type = op.in().getType(); + + const auto in_shape = in_type.cast(); + const auto rank = in_type.dyn_cast().getRank(); + + // calculate reshape_vec + std::vector reshape_vec; + for (auto dim = 0; dim < start_dim; ++dim) { reshape_vec.push_back(in_shape.getDimSize(dim)); } + auto last_dim = end_dim < 0 ? 
rank : end_dim + 1; + int flatten_size = 1; + for (auto dim = start_dim; dim < last_dim; ++dim) { flatten_size *= in_shape.getDimSize(dim); } + reshape_vec.push_back(flatten_size); + if (end_dim > 0) { + for (auto dim = end_dim + 1; dim < rank; ++dim) { + reshape_vec.push_back(in_shape.getDimSize(dim)); + } + } + // generate reshape op + const auto output = RankedTensorType::get(reshape_vec, in_shape.getElementType()); + auto input1 = op.in(); + auto new_shape = rewriter.getI64ArrayAttr(reshape_vec); + + rewriter.replaceOpWithNewOp(op, output, input1, new_shape); return success(); } }; +struct MatmulOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(MatmulOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + // TODO: more throw for robust in matmul shape rank + auto loc = op.getLoc(); + + auto preprocess = [&](Value matrix, bool transpose) -> Value { + auto shape_type = matrix.getType().cast(); + if (transpose) { matrix = CreateTranspose(loc, rewriter, matrix, {1, 0}); } + + shape_type = matrix.getType().cast(); + auto reshape_type = RankedTensorType::get( + {1, shape_type.getDimSize(0), shape_type.getDimSize(1)}, shape_type.getElementType()); + + return rewriter.create( + op.getLoc(), reshape_type, matrix, + rewriter.getI64ArrayAttr({1, shape_type.getDimSize(0), shape_type.getDimSize(1)})); + }; + + auto a = preprocess(op.a(), op.transpose_a()); + auto b = preprocess(op.b(), op.transpose_b()); + + const auto out_shape_type = op.out().getType().cast(); + const auto out_reshape_type = + RankedTensorType::get({1, out_shape_type.getDimSize(0), out_shape_type.getDimSize(1)}, + out_shape_type.getElementType()); + + auto matmul = rewriter.create(loc, out_reshape_type, a, b); + const auto new_shape = + rewriter.getI64ArrayAttr({out_shape_type.getDimSize(0), out_shape_type.getDimSize(1)}); + + rewriter.replaceOpWithNewOp(op, out_shape_type, matmul, 
new_shape); + return success(); + } +}; + +struct NormalizationInferenceOpLowering final + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(NormalizationInferenceOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto reshape_dim = [&](Type type, Value value) -> Value { + RankedTensorType in_type = value.getType().dyn_cast(); + RankedTensorType out_type = type.cast(); + SmallVector new_shape = {in_type.getShape()[0]}; + for (auto i = 2; i < out_type.getRank(); ++i) new_shape.push_back(1); + auto new_type = RankedTensorType::get(new_shape, out_type.getElementType()); + return rewriter.create(op->getLoc(), new_type, value, + rewriter.getI64ArrayAttr(new_shape)); + }; + + auto loc = op->getLoc(); + const auto out_type = op.y().getType(); + + const auto epsilon_type = RankedTensorType::get({}, rewriter.getF32Type()); + // epsilon = reshape(epsilon, shape_1) + auto epsilon = rewriter.create( + loc, epsilon_type, DenseElementsAttr::get(epsilon_type, op.epsilon())); + // mean = reshape(mean, shape_0) + auto mean = reshape_dim(out_type, adaptor.moving_mean()); + // variance= reshape(variance, shape_0) + auto variance = reshape_dim(out_type, adaptor.moving_variance()); + // scale = reshape(scale, shape_0) + auto gamma = reshape_dim(out_type, adaptor.gamma()); + // beta = reshape(beta, shape_0) + auto beta = reshape_dim(out_type, adaptor.beta()); + auto output = op.y(); + auto x = op.x(); + + auto batch_norm = + oneflow::CreateBNOp(loc, rewriter, output, x, mean, variance, epsilon, gamma, beta); + rewriter.replaceOp(op, {batch_norm}); + return success(); + } +}; + +struct NormalizationOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(NormalizationOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto reshape_dim = [&](Type type, Value value) -> 
Value { + const RankedTensorType in_type = value.getType().dyn_cast(); + const RankedTensorType out_type = type.cast(); + SmallVector new_shape = {in_type.getShape()[0]}; + for (auto i = 2; i < out_type.getRank(); ++i) new_shape.push_back(1); + const auto new_type = RankedTensorType::get(new_shape, out_type.getElementType()); + return rewriter.create(op->getLoc(), new_type, value, + rewriter.getI64ArrayAttr(new_shape)); + }; + + auto loc = op->getLoc(); + const auto out_type = op.y().getType(); + + const auto epsilon_type = RankedTensorType::get({}, rewriter.getF32Type()); + // epsilon = reshape(epsilon, shape_1) + auto epsilon = rewriter.create( + loc, epsilon_type, DenseElementsAttr::get(epsilon_type, op.epsilon())); + // mean = reshape(mean, shape_0) + auto mean = reshape_dim(out_type, adaptor.moving_mean()); + // variance= reshape(variance, shape_0) + auto variance = reshape_dim(out_type, adaptor.moving_variance()); + // scale = reshape(scale, shape_0) + auto gamma = reshape_dim(out_type, adaptor.gamma()); + // beta = reshape(beta, shape_0) + auto beta = reshape_dim(out_type, adaptor.beta()); + auto output = op.y(); + auto x = op.x(); + + auto batch_norm = + oneflow::CreateBNOp(loc, rewriter, output, x, mean, variance, epsilon, gamma, beta); + auto moving_mean = op.moving_mean(); + auto moving_variance = op.moving_variance(); + + rewriter.replaceOp(op, {batch_norm, moving_mean, moving_variance}); + return success(); + } +}; + +struct Conv2DOpLowering final : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite(Conv2DOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + auto get_pair_int64_from_array = [](ArrayAttr arr) -> std::pair { + return {arr.getValue()[0].cast().getSInt(), + arr.getValue()[1].cast().getSInt()}; + }; + auto reshape_type = [](ShapedType shape_type, ArrayRef perms) -> RankedTensorType { + std::vector ranked_type; + for (auto index : perms) 
ranked_type.push_back(shape_type.getDimSize(index)); + return RankedTensorType::get(ranked_type, shape_type.getElementType()); + }; + + auto stride_pairs = get_pair_int64_from_array(op.strides()); + auto pad_pairs = get_pair_int64_from_array(op.padding_beforeAttr()); + auto dilation_pairs = get_pair_int64_from_array(op.dilation_rate()); + + const auto pad = rewriter.getI64ArrayAttr( + {pad_pairs.first, pad_pairs.second, pad_pairs.first, pad_pairs.second}); + const auto stride = rewriter.getI64ArrayAttr({stride_pairs.first, stride_pairs.second}); + const auto dilation = rewriter.getI64ArrayAttr({dilation_pairs.first, dilation_pairs.second}); + + auto bias = op.bias(); + auto loc = op.getLoc(); + if (!bias) { + const auto output_shape = op.out().getType().cast(); + const auto output_channels = output_shape.getDimSize(1); + const auto bias_elem_type = output_shape.getElementType(); + const auto type = RankedTensorType::get(output_channels, bias_elem_type); + bias = rewriter.create( + op.getLoc(), type, DenseElementsAttr::get(type, rewriter.getZeroAttr(bias_elem_type))); + } + + auto perms = {0, 2, 3, 1}; + auto in = CreateTranspose(loc, rewriter, op.in(), perms); + auto weight = CreateTranspose(loc, rewriter, op.weight(), perms); + const auto output = reshape_type(op.out().getType().cast(), perms); + + auto conv2d = + rewriter.create(loc, output, in, weight, bias, pad, stride, dilation); + + auto res = CreateTranspose(loc, rewriter, conv2d, {0, 3, 1, 2}); + rewriter.replaceOp(op, {res}); + return success(); + getTypeConverter(); + } +}; + namespace { struct OneFlowLoweringToTosaPass : public LowerOneFlowToTosaPassBase { void runOnOperation() override; @@ -95,11 +538,20 @@ std::unique_ptr createLowerOneFlowToTosaPass() { } void OneFlowLoweringToTosaPass::runOnOperation() { - ConversionTarget target(getContext()); - target.addLegalDialect(); + MLIRContext* context = &getContext(); + ConversionTarget target(*context); + target.addLegalDialect(); 
target.addIllegalDialect(); - RewritePatternSet patterns(&getContext()); - patterns.insert(&getContext()); + + TypeConverter typeConverter; + typeConverter.addConversion([](Type type) { return type; }); + RewritePatternSet patterns(context); + patterns.add(typeConverter, context); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { getOperation()->dump(); signalPassFailure(); diff --git a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp index d4093babaf9..6a4e3bb380b 100644 --- a/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp +++ b/oneflow/ir/lib/OneFlow/OneFlowSupport.cpp @@ -69,9 +69,9 @@ ::oneflow::Symbol<::oneflow::Device> MakeDevice(const mlir::Attribute& device_ta return ::oneflow::Device::ParseAndNew(device_info).GetOrThrow(); } -template +template mlir::DenseElementsAttr __TensorToDenseElementsAttr( - const std::shared_ptr<::oneflow::one::Tensor>& tensor, const mlir::FloatType& float_type) { + const std::shared_ptr<::oneflow::one::Tensor>& tensor, const MLIR_T& mlir_type) { ::oneflow::LazyMode::Guard guard{false}; const auto tensor_ = ::oneflow::one::functional::ToContiguous(tensor).GetPtrOrThrow(); auto shape = tensor_->shape(); @@ -81,7 +81,7 @@ mlir::DenseElementsAttr __TensorToDenseElementsAttr( CHECK_JUST(::oneflow::BlobBufferCopyUtil::To(ofblob_ptr, data.data(), data.size())); }; ::oneflow::one::SyncAccessTensorWithTimeOut(tensor_, callback, "const").GetOrThrow(); - return mlir::DenseElementsAttr::get(mlir::RankedTensorType::get(shape_vec, float_type), + return mlir::DenseElementsAttr::get(mlir::RankedTensorType::get(shape_vec, mlir_type), llvm::makeArrayRef(data)); } @@ -115,7 +115,12 @@ mlir::DenseElementsAttr TensorToDenseElementsAttr( const std::shared_ptr<::oneflow::one::Tensor>& tensor, MLIRContext* ctx) { const auto dtype = tensor->dtype()->data_type(); if (dtype == ::oneflow::DataType::kFloat) { - return __TensorToDenseElementsAttr(tensor, mlir::FloatType::getF32(ctx)); + return 
__TensorToDenseElementsAttr(tensor, + mlir::FloatType::getF32(ctx)); + } else if (dtype == ::oneflow::DataType::kInt64) { + auto mlir_type = mlir::IntegerType::get( + ctx, 64, mlir::IntegerType::SignednessSemantics::Signed); + return __TensorToDenseElementsAttr(tensor, mlir_type); + } llvm::errs() << "Converting oneflow::Tensor to mlir::DenseElementsAttr only support float32 and int64 now." << "\n"; exit(EXIT_FAILURE); @@ -132,8 +137,9 @@ std::shared_ptr<::oneflow::one::Tensor> DenseElementsAttrToTensor( return __DenseElementsAttrToTensor(dense_attr_, device_tag_attr, device_name_attr, ::oneflow::DataType::kFloat); } - llvm::errs() << "Converting mlir::DenseElementsAttr to oneflow::Tensor only support float32 now." - << "\n"; + llvm::errs() + << "Converting mlir::DenseElementsAttr to oneflow::Tensor only support float32 and int64 now." + << "\n"; exit(EXIT_FAILURE); } diff --git a/oneflow/ir/oneflow-extension/ir_pass.cpp b/oneflow/ir/oneflow-extension/ir_pass.cpp index e339b95f077..038c9578f55 100644 --- a/oneflow/ir/oneflow-extension/ir_pass.cpp +++ b/oneflow/ir/oneflow-extension/ir_pass.cpp @@ -177,6 +177,11 @@ Maybe IRRoundTrip::Apply(Job* job, JobPassCtx* ctx) const { template class IRRoundTrip; template class IRRoundTrip; +Maybe ConvertJobToTosaIR(Job* job) { + RoundTripOneFlowJobWrapper job_wrapper(job); + return ::mlir::oneflow::ConvertJobToTosaIR(job_wrapper); +} + Maybe SaveJobToIR(Job* job, const std::string& path) { // TODO: check path is valid dir if (IsInDebugMode()) { TeePersistentLogStream::Create("saved_job")->Write(*job); } diff --git a/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h b/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h index 7d8baa77f68..b2afa4fbae1 100644 --- a/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h +++ b/oneflow/ir/oneflow-translate/include/OneFlow/MLIROneFlowTranslation.h @@ -150,6 +150,7 @@ void RoundTripOneFlowJob( void registerFromOneFlowJobTranslation(); +std::string 
ConvertJobToTosaIR(RoundTripOneFlowJobWrapperInterface& job_wrapper); void SaveJobToIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path); void LoadJobFromIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path); diff --git a/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp b/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp index fb876048a99..9491b715593 100644 --- a/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp +++ b/oneflow/ir/oneflow-translate/lib/OneFlow/MLIROneFlowTranslation.cpp @@ -850,6 +850,35 @@ void RoundTripOneFlowJob( } } +std::string ConvertJobToTosaIR(RoundTripOneFlowJobWrapperInterface& job_wrapper) { + const ::oneflow::Job* job = job_wrapper.job(); + mlir::MLIRContext context; + context.getOrLoadDialect(); + context.loadDialect(); + + OwningOpRef module( + ModuleOp::create(FileLineColLoc::get(&context, "", /*line=*/0, /*column=*/0))); + JobImporter imp(job_wrapper, &context, module.get()); + if (succeeded(imp.ProcessJob())) { + mlir::PassManager pm(&context); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createLowerOneFlowToTosaPass()); + if (mlir::failed(pm.run(*module))) { + module->emitError("Failed to run oneflow-to-tosa pass"); + exit(EXIT_FAILURE); + } + + std::string mlir; + llvm::raw_string_ostream os_mlir(mlir); + module->print(os_mlir); + return mlir; + } else { + const auto& job_name = job->job_conf().job_name(); + llvm::errs() << "fail to convert job to IR, job_name: " << job_name << "\n"; + exit(EXIT_FAILURE); + } +} + void SaveJobToIR(RoundTripOneFlowJobWrapperInterface& job_wrapper, const std::string& path) { const ::oneflow::Job* job = job_wrapper.job(); mlir::MLIRContext context; diff --git a/oneflow/ir/test/Frontend/OneFlowToIree.mlir b/oneflow/ir/test/Frontend/OneFlowToIree.mlir new file mode 100644 index 00000000000..834063b7a71 --- /dev/null +++ b/oneflow/ir/test/Frontend/OneFlowToIree.mlir @@ -0,0 +1,266 @@ +// RUN: 
oneflow-opt %s \ +// RUN: -split-input-file \ +// RUN: -lower-oneflow-to-tosa \ +// RUN: -verify-diagnostics -o - \ +// RUN: | ireec \ +// RUN: --iree-input-type=tosa \ +// RUN: --iree-vm-bytecode-module-output-format=flatbuffer-binary \ +// RUN: -iree-hal-target-backends=dylib-llvm-aot \ +// RUN: -iree-mlir-to-vm-bytecode-module - + + +oneflow.job @test_func(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + oneflow.return %arg0 : tensor<1xf32> +} + + +oneflow.job @test_input(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.input"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +oneflow.job @test_output(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.output"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +oneflow.job @test_variable() -> tensor<64x3x7x7xf32> +{ + %res = "oneflow.variable"() { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + nd_sbp = ["B"], + op_name = "fw.model.conv1.weight", + output_lbns = ["fw.model.conv1.weight/out"], + scope_symbol_id = 4611686018427432959 : i64, + shape = [64 : si64, 3 : si64, 7 : si64, 7 : si64] + } : () -> tensor<64x3x7x7xf32> + oneflow.return %res : tensor<64x3x7x7xf32> +} + + +oneflow.job @test_add_n2(%arg0: tensor<1x7x7xf32>, %arg1: tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +{ + %res = "oneflow.add_n2"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + 
op_type_name = "add_n", + output_lbns = [""], + scope_symbol_id = 4611686018431205375 : i64 + } : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> + oneflow.return %res: tensor<1x7x7xf32> +} + + +oneflow.job @test_broadcast_add(%arg0: tensor<1x1000xf32>, %arg1: tensor<1000xf32>) -> tensor<1x1000xf32> +{ + %res = "oneflow.broadcast_add"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64 + } : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +oneflow.job @test_max_pool_2d(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> +{ + %y, %indice = "oneflow.max_pool_2d"(%arg0) + { + ceil_mode = false, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation = [1 : si32, 1 : si32], + hierarchy = [1], kernel_size = [3 : si32, 3 : si32], + op_name = "", + output_lbns = ["", ""], + padding = [1 : si32, 1 : si32], + return_indices = false, + scope_symbol_id = 4611686018427502591 : i64, + stride = [2 : si32, 2 : si32] + } : (tensor<1x64x112x112xf32>) -> (tensor<1x64x56x56xf32>, tensor<1x64x56x56xi64>) + oneflow.return %y : tensor<1x64x56x56xf32> +} + + +oneflow.job @test_avg_pool_2d(%arg0: tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> +{ + %res = "oneflow.avg_pool_2d"(%arg0) + { + ceil_mode = false, + count_include_pad = true, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + divisor_override = 0 : si32, + hierarchy = [1], + kernel_size = [7 : si32, 7 : si32], + op_name = "model.avgpool-avg_pool_2d-172", + output_lbns = ["model.avgpool-avg_pool_2d-172/y_0"], + padding = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018430775295 : i64, + stride = [7 : si32, 7 : si32] + } : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> + oneflow.return %res: tensor<1x2048x1x1xf32> +} + + +oneflow.job @test_conv2d(%arg0: 
tensor<1x3x224x224xf32>, %arg1: tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> +{ + %res = "oneflow.conv2d"(%arg0, %arg1) + { + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation_rate = [1 : si32, 1 : si32], + filters = 512 : si32, + groups = 1 : si32, + hierarchy = [1], + kernel_size = [1 : si32, 1 : si32], + op_name = "", + operand_segment_sizes = dense<[1, 1, 0, 0]> : vector<4xi32>, + output_lbns = [""], + padding_before = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018431012863 : i64, + strides = [1 : si32, 1 : si32] + } : (tensor<1x3x224x224xf32>, tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> + oneflow.return %res : tensor<1x5x224x224xf32> +} + + +oneflow.job @test_flatten(%arg0: tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +{ + %res = "oneflow.flatten"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + end_dim = 2 : si32, + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431217663 : i64, + start_dim = 1 : si32 + } : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> + oneflow.return %res : tensor<4x6x1xf32> +} + + +oneflow.job @test_matmul(%arg0: tensor<1x2048xf32>, %arg1: tensor<1000x2048xf32>) ->tensor<1x1000xf32> +{ + %res = "oneflow.matmul"(%arg0, %arg1) + { + alpha = 1.000000e+00 : f64, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64, + transpose_a = false, + transpose_b = true + } : (tensor<1x2048xf32>, tensor<1000x2048xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +oneflow.job @test_relu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + %res = "oneflow.relu"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427424767 : i64 + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + +oneflow.job @test_bn( +%x: 
tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y, %mean, %inv_variance = "oneflow.normalization"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + oneflow.return %y: tensor<1x64x112x112xf32> +} + +oneflow.job @test_bn_infer( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y = "oneflow.normalization_infer"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x64x112x112xf32> + oneflow.return %y: tensor<1x64x112x112xf32> +} diff --git a/oneflow/ir/test/Frontend/lit.local.cfg b/oneflow/ir/test/Frontend/lit.local.cfg new file mode 100644 index 00000000000..a63a9b31aa9 --- /dev/null +++ b/oneflow/ir/test/Frontend/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.WITH_ONEFLOW_IREE: + 
config.unsupported = True diff --git a/oneflow/ir/test/Frontend/test_iree_resnet.py b/oneflow/ir/test/Frontend/test_iree_resnet.py new file mode 100644 index 00000000000..7ac4e226965 --- /dev/null +++ b/oneflow/ir/test/Frontend/test_iree_resnet.py @@ -0,0 +1,113 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +# RUN: python3 %s + +from oneflow_iree.compiler import Runner +from flowvision.models import resnet50 +import oneflow as flow +import oneflow.unittest +import unittest +import os +import numpy as np +import time + + +os.environ["ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"] = "1" + + +def _test_iree_resnet_cpu(test_case): + model = resnet50(pretrained=True) + model.eval() + + class GraphModule(flow.nn.Graph): + def __init__(self): + super().__init__() + self.model = model + + def build(self, x): + return self.model(x) + + func = Runner(GraphModule, return_numpy=True) + input = flow.ones([1, 3, 224, 224]) + f = GraphModule() + for iter in range(3): + print("======== in cpu iter" + str(iter + 1)) + iree_output = func(input) + start_time = time.time() + graph_output = f(input) + gap = time.time() - start_time + print("graph cost: " + str(gap)) + graph_output = graph_output.cpu().detach().numpy() + rtol = np.abs((graph_output - iree_output) / iree_output) + np.set_printoptions(threshold=np.inf) + print( + np.transpose( + np.concatenate((graph_output, iree_output, rtol), axis=0), [1, 0] + ) + ) + # the rtol accumulate 
layer by layer + test_case.assertTrue( + np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3) + ) + + +def _test_iree_resnet_cuda(test_case): + model = resnet50(pretrained=True).cuda() + model.eval() + + class GraphModule(flow.nn.Graph): + def __init__(self): + super().__init__() + self.model = model + + def build(self, x): + return self.model(x) + + func = Runner(GraphModule, return_numpy=True).cuda() + input = flow.ones([1, 3, 224, 224]).cuda() + f = GraphModule() + for iter in range(3): + print("======== in cuda iter" + str(iter + 1)) + iree_output = func(input) + start_time = time.time() + graph_output = f(input) + gap = time.time() - start_time + print("graph cost: " + str(gap)) + graph_output = graph_output.cpu().detach().numpy() + rtol = np.abs((graph_output - iree_output) / iree_output) + np.set_printoptions(threshold=np.inf) + print( + np.transpose( + np.concatenate((graph_output, iree_output, rtol), axis=0), [1, 0] + ) + ) + # the rtol accumulate layer by layer + test_case.assertTrue( + np.allclose(iree_output, graph_output, rtol=1.0e-1, atol=1e-3) + ) + + +@flow.unittest.skip_unless_1n1d() +class TestIreeResnet(oneflow.unittest.TestCase): + def test_iree_resnet_cpu(test_case): + _test_iree_resnet_cpu(test_case) + + def test_iree_resnet_cuda(test_case): + _test_iree_resnet_cuda(test_case) + + +if __name__ == "__main__": + unittest.main() diff --git a/oneflow/ir/test/Frontend/test_iree_runner.py b/oneflow/ir/test/Frontend/test_iree_runner.py new file mode 100644 index 00000000000..a0caa90fecd --- /dev/null +++ b/oneflow/ir/test/Frontend/test_iree_runner.py @@ -0,0 +1,71 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +# RUN: python3 %s + +from oneflow_iree.compiler import Runner +import oneflow as flow +import oneflow.unittest +import unittest +import numpy as np + + +class RELU(flow.nn.Module): + def __init__(self): + super().__init__() + self.relu = flow.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + +class GraphModule(flow.nn.Graph): + def __init__(self): + super().__init__() + self.fw = RELU() + + def build(self, x): + return self.fw(x) + + +def _test_check_iree_runner(test_case): + func = Runner(GraphModule, return_numpy=True).cuda() + # run on iree cuda backend + input = flow.Tensor([-1.0, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0])) + # change input shape + input = flow.Tensor([-1.0, 1.0, -1]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0, 0.0])) + # change on iree cpu backend + func = func.cpu() + input = flow.Tensor([-1.0, 0.0, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 0.0, 1.0])) + # change input shape + input = flow.Tensor([-1, 1.0]) + output = func(input) + test_case.assertTrue(np.allclose(output, [0.0, 1.0])) + + +@flow.unittest.skip_unless_1n1d() +class TestCheckIreeRunner(oneflow.unittest.TestCase): + def test_check_iree_runner(test_case): + _test_check_iree_runner(test_case) + + +if __name__ == "__main__": + unittest.main() diff --git a/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir new file mode 100644 index 00000000000..34ee5b499dc --- /dev/null +++ 
b/oneflow/ir/test/Frontend/test_tosa_to_elf.mlir @@ -0,0 +1,16 @@ +// RUN: oneflow-opt %s \ +// RUN: -pass-pipeline="func.func(tosa-to-linalg)" -cse \ +// RUN: --linalg-fuse-elementwise-ops -linalg-bufferize \ +// RUN: -tensor-bufferize -func-bufferize -buffer-results-to-out-params \ +// RUN: -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm \ +// RUN: -convert-func-to-llvm -convert-memref-to-llvm -reconcile-unrealized-casts --print-after-all \ +// RUN: | oneflow-translate -mlir-to-llvmir | clang -x ir - -c -o test.o + +builtin.module { + func.func @Graph_0(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tosa.cast"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %1 = "tosa.reluN"(%0) {max_fp = 3.40282347E+38 : f32, max_int = 9223372036854775807 : i64} : (tensor<2xf32>) -> tensor<2xf32> + %2 = "tosa.cast"(%1) : (tensor<2xf32>) -> tensor<2xf32> + func.return %2 : tensor<2xf32> + } +} diff --git a/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir b/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir new file mode 100644 index 00000000000..3028b3c04bf --- /dev/null +++ b/oneflow/ir/test/OneFlow/conversion/OneFlowToTosa.mlir @@ -0,0 +1,342 @@ +// RUN: oneflow-opt %s \ +// RUN: -split-input-file \ +// RUN: -lower-oneflow-to-tosa \ +// RUN: -verify-diagnostics -o - \ +// RUN: | FileCheck %s + + +// CHECK-LABEL: test_func +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_func(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + oneflow.return %arg0 : tensor<1xf32> +} + + +// CHECK-LABEL: test_input +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_input(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.input"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : 
tensor<1xf32> +} + + +// CHECK-LABEL: test_output +// CHECK: return [[V0:%.+]] : tensor<1xf32> +oneflow.job @test_output(%arg0: tensor<1xf32>) -> tensor<1xf32> +{ + %res = "oneflow.output"(%arg0) + { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + is_dynamic = false, + nd_sbp = ["B"], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427412479 : i64, + shape = [1 : si64] + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + + +// CHECK-LABEL: test_variable +// CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<64x3x7x7xf32>} : () -> tensor<64x3x7x7xf32> +// CHECK: return [[V0]] : tensor<64x3x7x7xf32> +oneflow.job @test_variable() -> tensor<64x3x7x7xf32> +{ + %res = "oneflow.variable"() { + data_type = 2 : i32, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + nd_sbp = ["B"], + op_name = "fw.model.conv1.weight", + output_lbns = ["fw.model.conv1.weight/out"], + scope_symbol_id = 4611686018427432959 : i64, + shape = [64 : si64, 3 : si64, 7 : si64, 7 : si64] + } : () -> tensor<64x3x7x7xf32> + oneflow.return %res : tensor<64x3x7x7xf32> +} + + +//CHECK-LABEL: test_add_n2 +//CHECK: [[V0:%.+]] = "tosa.add"(%arg0, %arg1) : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +//CHECK: return [[V0]] : tensor<1x7x7xf32> +oneflow.job @test_add_n2(%arg0: tensor<1x7x7xf32>, %arg1: tensor<1x7x7xf32>) -> tensor<1x7x7xf32> +{ + %res = "oneflow.add_n2"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + op_type_name = "add_n", + output_lbns = [""], + scope_symbol_id = 4611686018431205375 : i64 + } : (tensor<1x7x7xf32>, tensor<1x7x7xf32>) -> tensor<1x7x7xf32> + oneflow.return %res: tensor<1x7x7xf32> +} + + +//CHECK-LABEL: test_broadcast_add +//CHECK: [[V0:%.+]] = "tosa.add"(%arg0, %arg1) : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> +//CHECK: return [[V0]] : tensor<1x1000xf32> 
+oneflow.job @test_broadcast_add(%arg0: tensor<1x1000xf32>, %arg1: tensor<1000xf32>) -> tensor<1x1000xf32> +{ + %res = "oneflow.broadcast_add"(%arg0, %arg1) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64 + } : (tensor<1x1000xf32>, tensor<1000xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +//CHECK-LABEL: test_max_pool_2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V1:%.+]] = "tosa.transpose"(%arg0, [[V0]]) : (tensor<1x64x112x112xf32>, tensor<4xi32>) -> tensor<1x112x112x64xf32> +//CHECK: [[V2:%.+]] = "tosa.max_pool2d"([[V1]]) {kernel = [3, 3], pad = [1, 1, 1, 1], stride = [2, 2]} : (tensor<1x112x112x64xf32>) -> tensor<1x56x56x64xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"([[V2]], [[V3]]) : (tensor<1x56x56x64xf32>, tensor<4xi32>) -> tensor<1x64x56x56xf32> +//CHECK: [[V5:%.+]] = "tosa.const"() {value = dense<0> : tensor<1x64x56x56xi64>} : () -> tensor<1x64x56x56xi64> +//CHECK: return [[V4]] : tensor<1x64x56x56xf32> +oneflow.job @test_max_pool_2d(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> +{ + %y, %indice = "oneflow.max_pool_2d"(%arg0) + { + ceil_mode = false, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation = [1 : si32, 1 : si32], + hierarchy = [1], kernel_size = [3 : si32, 3 : si32], + op_name = "", + output_lbns = ["", ""], + padding = [1 : si32, 1 : si32], + return_indices = false, + scope_symbol_id = 4611686018427502591 : i64, + stride = [2 : si32, 2 : si32] + } : (tensor<1x64x112x112xf32>) -> (tensor<1x64x56x56xf32>, tensor<1x64x56x56xi64>) + oneflow.return %y : tensor<1x64x56x56xf32> +} + + +//CHECK-LABEL: test_avg_pool_2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> 
: tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V1:%.+]] = "tosa.transpose"(%arg0, [[V0]]) : (tensor<1x2048x7x7xf32>, tensor<4xi32>) -> tensor<1x7x7x2048xf32> +//CHECK: [[V2:%.+]] = "tosa.avg_pool2d"([[V1]]) {kernel = [7, 7], pad = [0, 0, 0, 0], stride = [7, 7]} : (tensor<1x7x7x2048xf32>) -> tensor<1x1x1x2048xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"([[V2]], [[V3]]) : (tensor<1x1x1x2048xf32>, tensor<4xi32>) -> tensor<1x2048x1x1xf32> +//CHECK: return [[V4]] : tensor<1x2048x1x1xf32> +oneflow.job @test_avg_pool_2d(%arg0: tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> +{ + %res = "oneflow.avg_pool_2d"(%arg0) + { + ceil_mode = false, + count_include_pad = true, + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + divisor_override = 0 : si32, + hierarchy = [1], + kernel_size = [7 : si32, 7 : si32], + op_name = "model.avgpool-avg_pool_2d-172", + output_lbns = ["model.avgpool-avg_pool_2d-172/y_0"], + padding = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018430775295 : i64, + stride = [7 : si32, 7 : si32] + } : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x1x1xf32> + oneflow.return %res: tensor<1x2048x1x1xf32> +} + + +//CHECK-LABEL: test_conv2d +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<5xf32>} : () -> tensor<5xf32> +//CHECK: [[V1:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V2:%.+]] = "tosa.transpose"(%arg0, [[V1]]) : (tensor<1x3x224x224xf32>, tensor<4xi32>) -> tensor<1x224x224x3xf32> +//CHECK: [[V3:%.+]] = "tosa.const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V4:%.+]] = "tosa.transpose"(%arg1, [[V3]]) : (tensor<5x3x1x1xf32>, tensor<4xi32>) -> tensor<5x1x1x3xf32> +//CHECK: [[V5:%.+]] = "tosa.conv2d"([[V2]], [[V4]], [[V0]]) {dilation = [1, 1], pad = [0, 0, 0, 0], stride = [1, 1]} : 
(tensor<1x224x224x3xf32>, tensor<5x1x1x3xf32>, tensor<5xf32>) -> tensor<1x224x224x5xf32> +//CHECK: [[V6:%.+]] = "tosa.const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> +//CHECK: [[V7:%.+]] = "tosa.transpose"([[V5]], [[V6]]) : (tensor<1x224x224x5xf32>, tensor<4xi32>) -> tensor<1x5x224x224xf32> +//CHECK: return [[V7]] : tensor<1x5x224x224xf32> +oneflow.job @test_conv2d(%arg0: tensor<1x3x224x224xf32>, %arg1: tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> +{ + %res = "oneflow.conv2d"(%arg0, %arg1) + { + data_format = "channels_first", + device_name = ["@0:0"], + device_tag = "cpu", + dilation_rate = [1 : si32, 1 : si32], + filters = 512 : si32, + groups = 1 : si32, + hierarchy = [1], + kernel_size = [1 : si32, 1 : si32], + op_name = "", + operand_segment_sizes = dense<[1, 1, 0, 0]> : vector<4xi32>, + output_lbns = [""], + padding_before = [0 : si32, 0 : si32], + scope_symbol_id = 4611686018431012863 : i64, + strides = [1 : si32, 1 : si32] + } : (tensor<1x3x224x224xf32>, tensor<5x3x1x1xf32>) -> tensor<1x5x224x224xf32> + oneflow.return %res : tensor<1x5x224x224xf32> +} + + +//CHECK-LABEL: test_flatten +//CHECK: [[V0:%.+]] = "tosa.reshape"(%arg0) {new_shape = [4, 6, 1]} : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +//CHECK: return [[V0]] : tensor<4x6x1xf32> +oneflow.job @test_flatten(%arg0: tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> +{ + %res = "oneflow.flatten"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + end_dim = 2 : si32, + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431217663 : i64, + start_dim = 1 : si32 + } : (tensor<4x3x2x1xf32>) -> tensor<4x6x1xf32> + oneflow.return %res : tensor<4x6x1xf32> +} + + +//CHECK-LABEL: test_matmul +//CHECK: [[V0:%.+]] = "tosa.reshape"(%arg0) {new_shape = [1, 1, 2048]} : (tensor<1x2048xf32>) -> tensor<1x1x2048xf32> +//CHECK: [[V1:%.+]] = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> +//CHECK: [[V2:%.+]] = 
"tosa.transpose"(%arg1, [[V1]]) : (tensor<1000x2048xf32>, tensor<2xi32>) -> tensor<2048x1000xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"([[V2]]) {new_shape = [1, 2048, 1000]} : (tensor<2048x1000xf32>) -> tensor<1x2048x1000xf32> +//CHECK: [[V4:%.+]] = "tosa.matmul"([[V0]], [[V3]]) : (tensor<1x1x2048xf32>, tensor<1x2048x1000xf32>) -> tensor<1x1x1000xf32> +//CHECK: [[V5:%.+]] = "tosa.reshape"([[V4]]) {new_shape = [1, 1000]} : (tensor<1x1x1000xf32>) -> tensor<1x1000xf32> +//CHECK: return [[V5]] : tensor<1x1000xf32> +oneflow.job @test_matmul(%arg0: tensor<1x2048xf32>, %arg1: tensor<1000x2048xf32>) ->tensor<1x1000xf32> +{ + %res = "oneflow.matmul"(%arg0, %arg1) + { + alpha = 1.000000e+00 : f64, + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018431234047 : i64, + transpose_a = false, + transpose_b = true + } : (tensor<1x2048xf32>, tensor<1000x2048xf32>) -> tensor<1x1000xf32> + oneflow.return %res : tensor<1x1000xf32> +} + + +//CHECK-LABEL: test_relu +//CHECK: [[V0:%.+]] = "tosa.reluN"(%arg0) {max_fp = 3.40282347E+38 : f32, max_int = 9223372036854775807 : i64} : (tensor<1xf32>) -> tensor<1xf32> +//CHECK: return [[V0]] : tensor<1xf32> +oneflow.job @test_relu(%arg0: tensor<1xf32>) -> tensor<1xf32> { + %res = "oneflow.relu"(%arg0) + { + device_name = ["@0:0"], + device_tag = "cpu", + hierarchy = [1], + op_name = "", + output_lbns = [""], + scope_symbol_id = 4611686018427424767 : i64 + } : (tensor<1xf32>) -> tensor<1xf32> + oneflow.return %res : tensor<1xf32> +} + +//CHECK-LABEL: test_bn +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<9.99999974E-6> : tensor} : () -> tensor +//CHECK: [[V1:%.+]] = "tosa.reshape"(%arg1) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V2:%.+]] = "tosa.reshape"(%arg2) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"(%arg3) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> 
tensor<64x1x1xf32> +//CHECK: [[V4:%.+]] = "tosa.reshape"(%arg4) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V5:%.+]] = "tosa.sub"(%arg0, [[V1]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V6:%.+]] = "tosa.add"([[V2]], [[V0]]) : (tensor<64x1x1xf32>, tensor) -> tensor<64x1x1xf32> +//CHECK: [[V7:%.+]] = "tosa.rsqrt"([[V6]]) : (tensor<64x1x1xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V8:%.+]] = "tosa.mul"([[V5]], [[V7]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V9:%.+]] = "tosa.mul"([[V8]], [[V3]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V10:%.+]] = "tosa.add"([[V9]], [[V4]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: return [[V10]] : tensor<1x64x112x112xf32> +oneflow.job @test_bn( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y, %mean, %inv_variance = "oneflow.normalization"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + oneflow.return %y: tensor<1x64x112x112xf32> +} + +//CHECK-LABEL: test_bn_infer +//CHECK: [[V0:%.+]] = "tosa.const"() {value = dense<9.99999974E-6> : tensor} : () -> tensor +//CHECK: [[V1:%.+]] = "tosa.reshape"(%arg1) {new_shape = 
[64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V2:%.+]] = "tosa.reshape"(%arg2) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V3:%.+]] = "tosa.reshape"(%arg3) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V4:%.+]] = "tosa.reshape"(%arg4) {new_shape = [64, 1, 1]} : (tensor<64xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V5:%.+]] = "tosa.sub"(%arg0, [[V1]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V6:%.+]] = "tosa.add"([[V2]], [[V0]]) : (tensor<64x1x1xf32>, tensor) -> tensor<64x1x1xf32> +//CHECK: [[V7:%.+]] = "tosa.rsqrt"([[V6]]) : (tensor<64x1x1xf32>) -> tensor<64x1x1xf32> +//CHECK: [[V8:%.+]] = "tosa.mul"([[V5]], [[V7]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V9:%.+]] = "tosa.mul"([[V8]], [[V3]]) {shift = 0 : i32} : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: [[V10:%.+]] = "tosa.add"([[V9]], [[V4]]) : (tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32> +//CHECK: return [[V10]] : tensor<1x64x112x112xf32> +oneflow.job @test_bn_infer( +%x: tensor<1x64x112x112xf32>, +%moving_mean: tensor<64xf32>, +%moving_variance: tensor<64xf32>, +%gamma: tensor<64xf32>, +%beta: tensor<64xf32>) -> tensor<1x64x112x112xf32> +{ + %y = "oneflow.normalization_infer"(%x, %moving_mean, %moving_variance, %gamma, %beta) + { + axis = 1 : si32, + device_name = ["@0:0"], + device_tag = "cpu", + epsilon = 9.99999974E-6 : f32, + hierarchy = [1], + momentum = 0.899999976 : f32, + op_name = "", + operand_segment_sizes = dense<[1, 1, 1, 1, 1, 0]> : vector<6xi32>, + output_lbns = ["", "", ""], + result_segment_sizes = dense<1> : vector<3xi32>, + scope_symbol_id = 4611686018427453439 : i64, + training = true + } : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x64x112x112xf32> + oneflow.return %y: 
tensor<1x64x112x112xf32> +} diff --git a/oneflow/ir/test/lit.cfg.py b/oneflow/ir/test/lit.cfg.py index af8af5b28af..275f16893d1 100644 --- a/oneflow/ir/test/lit.cfg.py +++ b/oneflow/ir/test/lit.cfg.py @@ -105,3 +105,10 @@ ] ) llvm_config.add_tool_substitutions(tools, tool_dirs) + +try: + import oneflow_iree.compiler + + config.WITH_ONEFLOW_IREE = True +except ImportError: + config.WITH_ONEFLOW_IREE = False diff --git a/oneflow/user/kernels/arg_where_kernel_util.cpp b/oneflow/user/kernels/arg_where_kernel_util.cpp index 25af71e776d..cc85cad8ad0 100644 --- a/oneflow/user/kernels/arg_where_kernel_util.cpp +++ b/oneflow/user/kernels/arg_where_kernel_util.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "oneflow/user/kernels/arg_where_kernel_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/kernel/kernel_util.h" namespace oneflow { diff --git a/oneflow/user/kernels/arg_where_kernel_util.cu b/oneflow/user/kernels/arg_where_kernel_util.cu index 61e6de4f543..522078e42ab 100644 --- a/oneflow/user/kernels/arg_where_kernel_util.cu +++ b/oneflow/user/kernels/arg_where_kernel_util.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "oneflow/user/kernels/arg_where_kernel_util.h" #include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/cuda/elementwise.cuh" #include "oneflow/core/kernel/kernel_util.h" #include "oneflow/core/ep/cuda/cuda_stream.h" diff --git a/oneflow/user/kernels/avg_pool_kernel_util.h b/oneflow/user/kernels/avg_pool_kernel_util.h index d6586bb70bf..d0b0ab8aeab 100644 --- a/oneflow/user/kernels/avg_pool_kernel_util.h +++ b/oneflow/user/kernels/avg_pool_kernel_util.h @@ -65,7 +65,7 @@ struct XPUAdd { OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) -typedef fixed_vector FixedDimVector; +typedef small_vector FixedDimVector; class AvgPoolParams3D { public: diff --git a/oneflow/user/kernels/dim_gather_kernels.cpp b/oneflow/user/kernels/dim_gather_kernels.cpp index 2efe0a937a3..efe197e4bc8 100644 --- a/oneflow/user/kernels/dim_gather_kernels.cpp +++ b/oneflow/user/kernels/dim_gather_kernels.cpp @@ -53,7 +53,7 @@ class DimGatherKernel final : public user_op::OpKernel { int dim_value = 0; if (ndim > 0) { dim_value = input_tensor->shape().At(dim); } - fixed_vector shape_vec(ndim); + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int64_t dim) -> IDX_T { return static_cast(dim); }); diff --git a/oneflow/user/kernels/dim_scatter_kernels.cpp b/oneflow/user/kernels/dim_scatter_kernels.cpp index 11ebbdc2e1c..df4721b6c3f 100644 --- a/oneflow/user/kernels/dim_scatter_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_kernels.cpp @@ -51,7 +51,7 @@ class DimScatterKernel final : public user_op::OpKernel { } const int ndim = src_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& 
tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int32_t dim) -> IDX_T { return static_cast(dim); }); diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp index e67322b02d2..34fab14c90c 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernels.cpp @@ -49,7 +49,7 @@ class DimScatterScalarKernel final : public user_op::OpKernel { } const int ndim = out_tensor->shape().NumAxes(); - fixed_vector shape_vec(ndim); + small_vector shape_vec(ndim); auto shape2dims = [&shape_vec, &ndim](const ShapeView& tensor_shape) -> void { std::transform(tensor_shape.ptr(), tensor_shape.ptr() + ndim, shape_vec.begin(), [](int32_t dim) -> IDX_T { return static_cast(dim); }); diff --git a/oneflow/user/kernels/gather_kernel_util.cpp b/oneflow/user/kernels/gather_kernel_util.cpp index bec965bdf12..88705ca4bff 100644 --- a/oneflow/user/kernels/gather_kernel_util.cpp +++ b/oneflow/user/kernels/gather_kernel_util.cpp @@ -85,7 +85,7 @@ void GatherKernelUtilImpl::Forward(ep::Stream* stream, c const T* from = in + outer_idx * gather_dim_size * inner_dim_size + idx * inner_dim_size; std::copy(from, from + inner_dim_size, to); } else { - std::memset(reinterpret_cast(to), 0, inner_dim_size * sizeof(K)); + std::memset(reinterpret_cast(to), 0, inner_dim_size * sizeof(T)); } } } diff --git a/oneflow/user/kernels/image_preprocess_kernels.cu b/oneflow/user/kernels/image_preprocess_kernels.cu index 30fda3bd96d..2b2e287e69c 100644 --- a/oneflow/user/kernels/image_preprocess_kernels.cu +++ b/oneflow/user/kernels/image_preprocess_kernels.cu @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/fixed_vector.h" +#include "oneflow/core/common/small_vector.h" #include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/ep/cuda/cuda_stream.h" diff --git a/oneflow/user/kernels/max_pool_kernel_util.h b/oneflow/user/kernels/max_pool_kernel_util.h index c62bf3bd3cd..821aa2020e0 100644 --- a/oneflow/user/kernels/max_pool_kernel_util.h +++ b/oneflow/user/kernels/max_pool_kernel_util.h @@ -41,7 +41,7 @@ namespace oneflow { #define POOL_DATA_TYPE_CUDA_SEQ POOL_DATA_TYPE_SEQ -typedef fixed_vector FixedDimVector; +typedef small_vector FixedDimVector; template struct DeviceAdd { diff --git a/oneflow/user/ops/arange_op.cpp b/oneflow/user/ops/arange_op.cpp index 225f3fa37cf..73585347376 100644 --- a/oneflow/user/ops/arange_op.cpp +++ b/oneflow/user/ops/arange_op.cpp @@ -84,8 +84,9 @@ namespace oneflow { const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; diff --git a/oneflow/user/ops/constant_op.cpp b/oneflow/user/ops/constant_op.cpp index 8cf14f3b22f..62d9bdcc050 100644 --- a/oneflow/user/ops/constant_op.cpp +++ b/oneflow/user/ops/constant_op.cpp @@ -29,8 +29,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + 
GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/distributions/normal_op.cpp b/oneflow/user/ops/distributions/normal_op.cpp index 5af64e0d3bb..736a70e5d0b 100644 --- a/oneflow/user/ops/distributions/normal_op.cpp +++ b/oneflow/user/ops/distributions/normal_op.cpp @@ -32,8 +32,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/distributions/uniform_int_op.cpp b/oneflow/user/ops/distributions/uniform_int_op.cpp index 9e79e69c4e5..f01bb710f3c 100644 --- a/oneflow/user/ops/distributions/uniform_int_op.cpp +++ b/oneflow/user/ops/distributions/uniform_int_op.cpp @@ -35,8 +35,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/distributions/uniform_op.cpp 
b/oneflow/user/ops/distributions/uniform_op.cpp index 206a27426d8..b7d566aac49 100644 --- a/oneflow/user/ops/distributions/uniform_op.cpp +++ b/oneflow/user/ops/distributions/uniform_op.cpp @@ -35,8 +35,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); diff --git a/oneflow/user/ops/eager_nccl_ops.cpp b/oneflow/user/ops/eager_nccl_ops.cpp index 1399ea4d97a..bd4cdda1367 100644 --- a/oneflow/user/ops/eager_nccl_ops.cpp +++ b/oneflow/user/ops/eager_nccl_ops.cpp @@ -133,8 +133,9 @@ namespace oneflow { const Shape& parallel_hierarchy = *ctx->parallel_desc().hierarchy(); const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, in_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, in_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *out_shape = physical_shape; } else { *out_shape = in_shape; diff --git a/oneflow/user/ops/empty_op.cpp b/oneflow/user/ops/empty_op.cpp index f2060b49950..4489902d730 100644 --- a/oneflow/user/ops/empty_op.cpp +++ b/oneflow/user/ops/empty_op.cpp @@ -30,8 +30,9 @@ namespace oneflow { const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); const Shape& logical_shape = ctx->Attr("shape"); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& 
physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; *ctx->OutputStride("out", 0) = Stride(physical_shape); diff --git a/oneflow/user/ops/image_preprocess_ops.cpp b/oneflow/user/ops/image_preprocess_ops.cpp index 6e72ccda1a5..00c6d419c8b 100644 --- a/oneflow/user/ops/image_preprocess_ops.cpp +++ b/oneflow/user/ops/image_preprocess_ops.cpp @@ -156,8 +156,9 @@ namespace oneflow { const Shape logical_shape = Shape({batch_size}); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; return Maybe::Ok(); } diff --git a/oneflow/user/ops/randperm_op.cpp b/oneflow/user/ops/randperm_op.cpp index c7e83402b86..aa6103a2f0d 100644 --- a/oneflow/user/ops/randperm_op.cpp +++ b/oneflow/user/ops/randperm_op.cpp @@ -39,8 +39,9 @@ namespace oneflow { int32_t n = ctx->Attr("n"); const Shape& logical_shape = Shape({n}); const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const Shape& physical_shape = - GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id).shape(); + const auto tensor_slice_view = + GetTensorSliceView4ParallelId(parallel_hierarchy, nd_sbp, logical_shape, parallel_id); + const Shape& physical_shape = tensor_slice_view.shape(); *ctx->OutputShape("out", 0) = physical_shape; diff --git a/oneflow/user/ops/stack_op.cpp b/oneflow/user/ops/stack_op.cpp index 254cbcd1743..1dd129081bd 100644 --- 
a/oneflow/user/ops/stack_op.cpp +++ b/oneflow/user/ops/stack_op.cpp @@ -144,8 +144,6 @@ Maybe GenGradOp(const user_op::UserOpWrapper& op, const user_op::AddOpFn& /*static*/ Maybe StackGradOp::GetSbp(user_op::SbpContext* ctx) { const auto axis = ctx->Attr("axis"); - const int64_t in_num_axes = - ctx->LogicalTensorDesc4InputArgNameAndIndex("in", 0).shape().NumAxes(); const int64_t like_num_axes = ctx->LogicalTensorDesc4InputArgNameAndIndex("like", 0).shape().NumAxes(); FOR_RANGE(int64_t, i, 0, like_num_axes) { diff --git a/oneflow/user/utils/pool_util.h b/oneflow/user/utils/pool_util.h index 9a21f8a9129..4deed023f1f 100644 --- a/oneflow/user/utils/pool_util.h +++ b/oneflow/user/utils/pool_util.h @@ -21,8 +21,8 @@ limitations under the License. namespace oneflow { -typedef fixed_vector FixedDimVector; -typedef fixed_vector FixedVector; +typedef small_vector FixedDimVector; +typedef small_vector FixedVector; class Params3D { public: diff --git a/python/oneflow/framework/multi_client_session.py b/python/oneflow/framework/multi_client_session.py index 72c6e093779..64a82c12b27 100644 --- a/python/oneflow/framework/multi_client_session.py +++ b/python/oneflow/framework/multi_client_session.py @@ -124,4 +124,7 @@ def update_resource_eagerly(self, resource_config): self._session_ctx.update_resource(config_proto_str) def __del__(self): + if self._env.is_shutting_down(): + # After python shutting down, it's not safe to call oneflow + return self._TryClose() diff --git a/python/oneflow/nn/graph/graph_config.py b/python/oneflow/nn/graph/graph_config.py index dfb3795b3ac..ea48ad8d957 100644 --- a/python/oneflow/nn/graph/graph_config.py +++ b/python/oneflow/nn/graph/graph_config.py @@ -17,6 +17,7 @@ from collections import OrderedDict +import oneflow.boxing.nccl as nccl_config from oneflow.nn.graph.optimizer import OptDict import oneflow.core.job.job_conf_pb2 as job_conf_pb @@ -45,24 +46,51 @@ def training(self): return False raise NotImplementedError - def 
set_outputs_buffer_size(self, value: int = 2): - r"""Set the outputs buffer size of ``nn.Graph``. + def enable_amp(self, mode: bool = True): + r"""If set to true, then graph will use mixed precision mode, it means use both float16 and float32 during model training. - When graph's outputs buffer size is greater than 2, multiple call on the graph can work like a pipeline. This makes multiple call takes less time. + For example: - The default outputs buffer size is 2. + .. code-block:: python - # TODO (lixiang): Explain the meaning of the size of buffer size and add sample code. - # The size of the buffer size indicates the maximum number of iterations that the output of the Graph and the Graph actually executed asynchronously can overlap. - # If the buffer size is 1, there is no pipeline. A size of 2 means that it can execute 1 iter ahead of time. A size of 3 means that two iters can be executed ahead of time. + import oneflow as flow + + class Graph(flow.nn.Graph): + def __init__(self): + super().__init__() + self.linear = flow.nn.Linear(3, 8, False) + self.config.enable_amp(True) # Use mixed precision mode. + def build(self, x): + return self.linear(x) + + graph = Graph() Args: - value (int): graph ouputs buffer size. + mode (bool, optional): The default vaule is True. + """ - self._outputs_buffer_size = value + assert type(mode) is bool + self.proto.enable_auto_mixed_precision = mode - def enable_amp(self, mode: bool = True): - r"""If set to true, then graph will use mixed precision mode, it means use both float16 and float32 during model training. + def set_zero_redundancy_optimizer_mode(self, mode: str = "distributed_split"): + raise RuntimeError( + "`set_zero_redundancy_optimizer_mode` has been changed to `enable_zero`, please use `enable_zero(True)` to activate ZeRO optimization." 
+ ) + + def enable_zero( + self, + mode: bool = True, + *, + stage: int = 2, + shard_min_size: int = 1024, + shard_restore_level: int = 1, + ): + r"""Enable ZeRO redundancy optimizer. + + This optimization will reduce optimizer states memory consumption as described + by ZeRO https://arxiv.org/abs/1910.02054 . + + The default zero stage is 2. For example: @@ -74,17 +102,36 @@ class Graph(flow.nn.Graph): def __init__(self): super().__init__() self.linear = flow.nn.Linear(3, 8, False) - self.config.enable_amp(True) # Use mixed precision mode. + self.config.enable_zero() def build(self, x): return self.linear(x) graph = Graph() Args: - mode (bool, optional): The default vaule is True. + mode (bool): if set to true, optimizer states of Data Parallel will be sharded across devices. + stage (int): optimization stage, range from 1 to 3. + shard_min_size (int): min size of a shard of an optimizer state. + shard_restore_level (int): level to restore sharded parameter to whole parameter for consumer operators, level 0 is no restore, level 1 is soft restore, level 2 is hard restore. Note that this parameter is at pre-alpha stage. """ - assert type(mode) is bool - self.proto.enable_auto_mixed_precision = mode + if not mode: + self.proto.optimizer_placement_optimization_mode = "none" + return + assert stage >= 1 and stage <= 3, "ZeRO stage must range form 1 to 3." + assert ( + shard_min_size > 0 + ), "ZeRO min size of a sharded optimizer state must > 0." + assert stage >= 1 and stage <= 3, "ZeRO stage must range form 1 to 3." 
+ if stage >= 1: + self.proto.optimizer_placement_optimization_mode = "distributed_split" + self.proto.optimizer_placement_optimization_threshold = shard_min_size + self.proto.optimizer_placement_optimization_shard_restore_level = ( + shard_restore_level + ) + if stage >= 2: + nccl_config.enable_use_compute_stream(True) + if stage >= 3: + nccl_config.disable_group_boxing_by_dst_parallel(True) def allow_fuse_model_update_ops(self, mode: bool = True): r"""If set to true, try to fuse cast + scale + l1_l2_regularize_gradient + model_update to one op to improve performance. @@ -188,61 +235,23 @@ def build(self, x): """ self.proto.num_gradient_accumulation_steps = value - def set_zero_redundancy_optimizer_mode(self, mode: str = "distributed_split"): - r"""Set mode to remove redundancy of optimizer states. - This optimzation will reduce optimizer states memory consumption as described - by ZeRO https://arxiv.org/abs/1910.02054 . - - For example: - - .. code-block:: python - - import oneflow as flow - - class Graph(flow.nn.Graph): - def __init__(self): - super().__init__() - self.linear = flow.nn.Linear(3, 8, False) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - def build(self, x): - return self.linear(x) - - graph = Graph() - - Args: - mode (str): "distributed_split" or "non_distributed". "distributed_split" mode - will shard each optimizer state across devices. "non_distributed" mode - will place each optimizer state to only one device. - """ - assert mode in ("distributed_split", "non_distributed") - self.proto.optimizer_placement_optimization_mode = mode - - def set_zero_redundancy_optimizer_min_size_after_split(self, value): - r"""Set the min size of optimizer state/grad/parameter after split. - - For example: - - .. code-block:: python + def set_outputs_buffer_size(self, value: int = 2): + r"""Set the outputs buffer size of ``nn.Graph``. 
- import oneflow as flow + When graph's outputs buffer size is greater than 2, multiple call on the graph can work like a pipeline. This makes multiple call takes less time. - class Graph(flow.nn.Graph): - def __init__(self): - super().__init__() - self.linear = flow.nn.Linear(3, 8, False) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - def build(self, x): - return self.linear(x) + The default outputs buffer size is 2. - graph = Graph() + # TODO (lixiang): Explain the meaning of the size of buffer size and add sample code. + # The size of the buffer size indicates the maximum number of iterations that the output of the Graph and the Graph actually executed asynchronously can overlap. + # If the buffer size is 1, there is no pipeline. A size of 2 means that it can execute 1 iter ahead of time. A size of 3 means that two iters can be executed ahead of time. Args: - value (int): min size value. + value (int): graph outputs buffer size. """ assert isinstance(value, int) assert value >= 1 - self.proto.optimizer_placement_optimization_threshold = value + self._outputs_buffer_size = value def enable_cudnn_conv_heuristic_search_algo(self, mode: bool = True): r""" Whether enable cudnn conv operatioin to use heuristic search algorithm. 
diff --git a/python/oneflow/nn/modules/sparse.py b/python/oneflow/nn/modules/sparse.py index 0731f7b21b9..b8eb4d50b9a 100644 --- a/python/oneflow/nn/modules/sparse.py +++ b/python/oneflow/nn/modules/sparse.py @@ -160,9 +160,12 @@ def forward(self, indices): flow._C.embedding_renorm_( self.weight, indices, self.max_norm, self.norm_type ) - return flow._C.embedding( - self.weight, indices, self.padding_idx, self.scale_grad_by_freq - ) + if self.padding_idx is None and not self.scale_grad_by_freq: + return flow._C.gather(self.weight, indices, axis=0) + else: + return flow._C.embedding( + self.weight, indices, self.padding_idx, self.scale_grad_by_freq + ) def embedding( @@ -232,7 +235,10 @@ def embedding( with flow.no_grad(): weight = flow._C.embedding_renorm_(weight, input, max_norm, norm_type) - return flow._C.embedding(weight, input, padding_idx, scale_grad_by_freq) + if padding_idx is None and not scale_grad_by_freq: + return flow._C.gather(weight, input, axis=0) + else: + return flow._C.embedding(weight, input, padding_idx, scale_grad_by_freq) if __name__ == "__main__": diff --git a/python/oneflow/test/graph/test_graph_zero.py b/python/oneflow/test/graph/test_graph_zero.py index 20fc7366bab..51fa38a8657 100644 --- a/python/oneflow/test/graph/test_graph_zero.py +++ b/python/oneflow/test/graph/test_graph_zero.py @@ -26,40 +26,42 @@ def train_with_graph(iter_num=1): P = flow.placement("cuda", ranks=[0, 1]) B = flow.sbp.broadcast S0 = flow.sbp.split(0) - linear = flow.nn.Linear(8, 4) - linear = linear.to_global(placement=P, sbp=B) - flow.nn.init.constant_(linear.weight, 2.068758) - flow.nn.init.constant_(linear.bias, 0.23) - of_sgd = flow.optim.SGD(linear.parameters(), lr=0.001, momentum=0.9) + + linear_dp = flow.nn.Linear(800, 400, bias=False) + linear_dp = linear_dp.to_global(placement=P, sbp=B) + flow.nn.init.constant_(linear_dp.weight, 2.068758) + + linear_mp = flow.nn.Linear(400, 500, bias=False) + linear_mp = linear_mp.to_global(placement=P, sbp=S0) + 
flow.nn.init.constant_(linear_mp.weight, 2.068758) + + of_sgd = flow.optim.SGD( + [{"params": linear_dp.parameters()}, {"params": linear_mp.parameters()}], + lr=0.001, + momentum=0.9, + ) grad_scaler = flow.amp.StaticGradScaler(200) - x = flow.randint(1, 100, (4, 8), dtype=flow.float32, placement=P, sbp=S0) + x = flow.randint(1, 100, (6, 800), dtype=flow.float32, placement=P, sbp=S0) class LinearTrainGraphWithZeRO(flow.nn.Graph): def __init__(self): super().__init__() - self.linear = linear + self.linear_dp = linear_dp + self.linear_mp = linear_mp self.add_optimizer(of_sgd) self.config.enable_amp(True) self.set_grad_scaler(grad_scaler) - if zero_stage == 1: - print("zero stage 1 optimization") - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - if zero_stage == 2: - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - flow.boxing.nccl.enable_use_compute_stream(True) - if zero_stage == 3: - print("zero stage 3 optimization") - self.config.set_zero_redundancy_optimizer_mode("distributed_split") - self.config.set_zero_redundancy_optimizer_min_size_after_split(1) - flow.boxing.nccl.enable_use_compute_stream(True) - flow.boxing.nccl.disable_group_boxing_by_dst_parallel(True) + self.config.enable_zero( + True, stage=zero_stage, shard_min_size=1, shard_restore_level=0, + ) + self.debug(2) def build(self, x): - out = self.linear(x) + out = self.linear_dp(x) + out = out.to_global(placement=P, sbp=B) + out = self.linear_mp(out) loss = out.sum() loss.backward() return out @@ -67,19 +69,26 @@ def build(self, x): class LinearEvalGraphWithZeRO(flow.nn.Graph): def __init__(self): super().__init__() - self.linear = linear + self.linear_dp = linear_dp + self.linear_mp = linear_mp self.config.enable_amp(True) def build(self, x): - out = self.linear(x) + out = self.linear_dp(x) + out = out.to_global(placement=P, 
sbp=B) + out = self.linear_mp(out) return out linear_t_g = LinearTrainGraphWithZeRO() + linear_t_g.debug(1) linear_e_g = LinearEvalGraphWithZeRO() + linear_e_g.debug(1) def one_train_iter(): out = linear_t_g(x) + if flow.env.get_rank() == 0: + print(linear_t_g) def one_eval_iter(): out = linear_e_g(x) @@ -89,8 +98,116 @@ def one_eval_iter(): # After pass rewrite in training graph, parameters' sbp has been # changed from flow.sbp.broadcast to flow.sbp.split(0) - test_case.assertEqual(linear.weight.sbp[0], S0) - test_case.assertEqual(linear.bias.sbp[0], S0) + test_case.assertEqual(linear_dp.weight.sbp[0], S0) + test_case.assertEqual(linear_mp.weight.sbp[0], S0) + + # In evaluation graph, parameters' sbp are flow.sbp.split(0). + # But their consumer will consume them as flow.sbp.broadcast. + one_eval_iter() + + iter_num = 1 + graph_check_list = train_with_graph(iter_num) + + +def _test_linear_train_graph_2d_with_zero(test_case, zero_stage=1): + def train_with_graph(iter_num=1): + P = flow.placement("cuda", ranks=[[0, 1], [2, 3]]) + B = flow.sbp.broadcast + S0 = flow.sbp.split(0) + S1 = flow.sbp.split(1) + + def get_mixed_linear(): + linear_dp_mp = flow.nn.Linear(800, 400, bias=False) + linear_dp_mp = linear_dp_mp.to_global(placement=P, sbp=[B, S0]) + flow.nn.init.constant_(linear_dp_mp.weight, 1.068758) + + linear_mp_dp = flow.nn.Linear(800, 400, bias=False) + linear_mp_dp = linear_mp_dp.to_global(placement=P, sbp=[S0, B]) + flow.nn.init.constant_(linear_mp_dp.weight, 1.068758) + + class MixedLinear(flow.nn.Module): + def __init__(self): + super().__init__() + self.dp_mp = linear_dp_mp + self.mp_dp = linear_mp_dp + + def forward(self, x): + x = self.dp_mp(x) + x = flow.relu(x) + x = self.mp_dp(x) + x = flow.relu(x) + return x + + return MixedLinear() + + mixed_linear0 = get_mixed_linear() + mixed_linear1 = get_mixed_linear() + + of_sgd = flow.optim.SGD( + [ + {"params": mixed_linear0.parameters()}, + {"params": mixed_linear1.parameters()}, + ], + lr=0.001, 
momentum=0.9, + ) + grad_scaler = flow.amp.StaticGradScaler(200) + + x = flow.rand((2, 800), dtype=flow.float32, placement=P, sbp=[S0, B]) + + class LinearTrainGraph2DWithZeRO(flow.nn.Graph): + def __init__(self): + super().__init__() + self.mixed_linear0 = mixed_linear0 + self.mixed_linear0.config.activation_checkpointing = True + self.mixed_linear1 = mixed_linear1 + self.mixed_linear1.config.activation_checkpointing = True + self.add_optimizer(of_sgd) + + self.config.enable_amp(True) + self.set_grad_scaler(grad_scaler) + self.config.enable_zero( + True, stage=zero_stage, shard_min_size=1, shard_restore_level=1, + ) + + def build(self, x): + out = self.mixed_linear0(x) + out = self.mixed_linear1(out) + loss = out.mean() + loss.backward() + return loss + + class LinearEvalGraph2DWithZeRO(flow.nn.Graph): + def __init__(self): + super().__init__() + self.mixed_linear0 = mixed_linear0 + self.mixed_linear1 = mixed_linear1 + + self.config.enable_amp(True) + + def build(self, x): + out = self.mixed_linear0(x) + out = self.mixed_linear1(out) + return out + + linear_t_g = LinearTrainGraph2DWithZeRO() + linear_e_g = LinearEvalGraph2DWithZeRO() + + def one_train_iter(): + out = linear_t_g(x) + # if flow.env.get_rank() == 0: + # print(linear_t_g) + + def one_eval_iter(): + out = linear_e_g(x) + + for i in range(iter_num): + one_train_iter() + + for state in linear_t_g._state(): + test_case.assertEqual( + state.origin.sbp, (oneflow.sbp.split(axis=0), oneflow.sbp.split(axis=0)) + ) # In evaluation graph, paramters's sbp are flow.sbp.split(0). # But their consumer will consum them as flow.sbp.broadcast. 
@@ -113,5 +230,18 @@ def test_linear_train_graph_with_zero_3(test_case): _test_linear_train_graph_with_zero(test_case, 3) +@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") +@flow.unittest.skip_unless_1n4d() +class TestLinearTrainGraph2DWithZeRO(oneflow.unittest.TestCase): + def test_linear_train_graph_2d_with_zero_3(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 3) + + def test_linear_train_graph_2d_with_zero_2(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 2) + + def test_linear_train_graph_2d_with_zero_1(test_case): + _test_linear_train_graph_2d_with_zero(test_case, 1) + + if __name__ == "__main__": unittest.main() diff --git a/python/oneflow/test/graph/test_optimization_conf.py b/python/oneflow/test/graph/test_optimization_conf.py index da6348b7033..a60d339be8b 100644 --- a/python/oneflow/test/graph/test_optimization_conf.py +++ b/python/oneflow/test/graph/test_optimization_conf.py @@ -66,7 +66,7 @@ def __init__(self): self.config.allow_fuse_add_to_output(True) self.config.allow_fuse_cast_scale(True) self.config.set_gradient_accumulation_steps(100) - self.config.set_zero_redundancy_optimizer_mode("distributed_split") + self.config.enable_zero(True) self.config.enable_cudnn_conv_heuristic_search_algo(False) def build(self, x): diff --git a/python/oneflow/test/modules/test_sparse.py b/python/oneflow/test/modules/test_sparse.py index 2a1ed9812db..01df8da23bb 100644 --- a/python/oneflow/test/modules/test_sparse.py +++ b/python/oneflow/test/modules/test_sparse.py @@ -184,7 +184,7 @@ def test_embedding_functional(test_case): # NOTE(Yao Zihang): Set check_graph=False temporarily # Graph mode do not support inplace op with flow.no_grad() # See this issue: https://github.com/Oneflow-Inc/OneTeam/issues/1382 - @autotest(n=5, check_graph="ValidatedFlase") + @autotest(n=5, rtol=1e-03, atol=1e-03, check_graph="ValidatedFlase") def test_embedding_renorm(test_case): device = random_device() emb_size = random(low=2) * 16