Feat/eager tensor to graph out and inplace #7254

Merged · 21 commits · Jan 17, 2022
Commits
c47fda2
feat(Parameter): Parameter support both inplace op and setter
wyg1997 Jan 13, 2022
4c47583
feat(Tensor): tensor support data's getter interface
wyg1997 Jan 13, 2022
94595a2
test(Parameter): add getter test
wyg1997 Jan 13, 2022
49e3193
debug
strint Jan 13, 2022
abc42e1
add test
strint Jan 13, 2022
bef91d5
open flatten graph test
strint Jan 13, 2022
b2f4c33
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
strint Jan 13, 2022
6149ce6
add validated flase type
strint Jan 13, 2022
fe6e92e
Merge branch 'feat/eager_tensor_to_graph_out_and_inplace' of https://…
strint Jan 13, 2022
e111000
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
strint Jan 14, 2022
6121b3c
refine
strint Jan 14, 2022
bb428d3
Merge branch 'feat/eager_tensor_to_graph_out_and_inplace' of https://…
strint Jan 14, 2022
713400d
foramt
strint Jan 14, 2022
403a77b
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
strint Jan 14, 2022
12eb8b9
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
oneflow-ci-bot Jan 14, 2022
91897ad
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
strint Jan 15, 2022
e7ed225
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
oneflow-ci-bot Jan 15, 2022
a18830a
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
oneflow-ci-bot Jan 15, 2022
bbcab0a
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
strint Jan 17, 2022
997c85f
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
oneflow-ci-bot Jan 17, 2022
d616df4
Merge branch 'master' into feat/eager_tensor_to_graph_out_and_inplace
oneflow-ci-bot Jan 17, 2022
105 changes: 58 additions & 47 deletions oneflow/core/framework/op_interpreter/lazy_op_interpreter.cpp
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <memory>
#include "oneflow/core/common/cpp_attribute.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/core/common/cpp_attribute.h"
@@ -190,6 +191,51 @@ Maybe<Scope> NewScopeWithParallelDescByTensor(const std::shared_ptr<Tensor>& ten
return NewScopeWithParallelConfAndCurScope(parallel_conf);
}

Maybe<void> AddFreeEagerTensorToVariableOp(const std::shared_ptr<Tensor>& input_tensor) {
CHECK_OR_RETURN(input_tensor->is_eager());
const std::string& empty_lbn = TensorNameScope::Global()->Lookup(input_tensor);
CHECK_OR_RETURN(empty_lbn.empty());
std::shared_ptr<Scope> scope = JUST(NewScopeWithParallelDescByTensor(input_tensor));
OperatorConf op_conf;
op_conf.set_scope_symbol_id(JUST(scope->symbol_id()));
op_conf.set_device_tag(GetDeviceTagOfTensor(input_tensor));
VariableOpConf* var_conf = op_conf.mutable_variable_conf();
var_conf->set_out("out");
input_tensor->shape()->ToProto(var_conf->mutable_shape());
var_conf->set_data_type(input_tensor->dtype()->data_type());
// NOTE(chengcheng): VariableOpConf initializer_conf is useless because the variable is initialized
// by the EagerTensor.
var_conf->mutable_initializer()->mutable_empty_conf();
JUST(GenVariableOpConfNdSbpStringByTensor(var_conf, input_tensor));
// NOTE(chengcheng): Free EagerTensor not trainable
var_conf->set_trainable(false);

auto infer_ctx = JUST(GetCurInferCtx());
// NOTE(chengcheng): MUST reset unique op name before InferCtx::AddOp; FreeEagerTensor has no
// name, so just create a unique name for it.
const std::string new_op_name = *JUST(infer_ctx->NewUniqueOpNameByFunctionalOpConf(op_conf));
op_conf.set_name(new_op_name);

VLOG(2) << "Lazy nn.Graph name " << infer_ctx->job().job_conf().job_name() << " try to add op: \n"
<< op_conf.DebugString() << std::endl;
OpAttribute op_attr = *JUST(infer_ctx->AddAndInferConsistentOp(op_conf));
VLOG(2) << "Lazy nn.Graph name " << infer_ctx->job().job_conf().job_name() << " add op : \n"
<< op_conf.DebugString() << " for FreeEagerTensor.\n";
VLOG(3) << "Lazy nn.Graph name " << infer_ctx->job().job_conf().job_name()
<< " infer and and op attr : \n"
<< op_attr.DebugString() << " for FreeEagerTensor.\n";

// NOTE(chengcheng): MUST store this tensor to MultiClientSessionContext for graph runtime bind.
const std::string graph_name = *JUST(JUST(GlobalJobBuildAndInferCtxMgr())->GetCurrentJobName());
const std::string lbn = GenLogicalBlobName(new_op_name, "out");
Global<MultiClientSessionContext>::Get()->StoreFreeEagerTensorWithNameByGraphName(
graph_name, input_tensor, new_op_name);
// NOTE(chengcheng): MUST record this eager_tensor name as new variable output lbn.
TensorNameScope::Global()->Record(input_tensor, lbn);

return Maybe<void>::Ok();
}

Maybe<void> LazyInterpreter::ApplyImpl(const FeedInputOpExpr& op_expr, const TensorTuple& inputs,
TensorTuple* outputs, const OpExprInterpContext& ctx) const {
// NOTE(chengcheng): inputs[0] is the EagerTensor
@@ -308,8 +354,16 @@ Maybe<void> LazyInterpreter::ApplyImpl(const FetchOutputOpExpr& op_expr, const T
CHECK_EQ_OR_RETURN(inputs.size(), 1);
CHECK_EQ_OR_RETURN(op_expr.input_size(), 1);
const std::shared_ptr<Tensor>& input_tensor = inputs.at(0);
CHECK_OR_RETURN(input_tensor->is_lazy());
const std::string& input_lbn = TensorNameScope::Global()->Lookup(input_tensor);
std::string input_lbn = TensorNameScope::Global()->Lookup(input_tensor);
// A lazy tensor must have an lbn.
// An eager tensor may have an lbn if it has already been treated as the output of a variable op
// or the output of an inplace op.
if (input_lbn.empty()) {
CHECK_OR_RETURN(input_tensor->is_eager());
// This output tensor is a new free eager tensor, so treat it as a new variable op output.
JUST(AddFreeEagerTensorToVariableOp(input_tensor));
input_lbn = TensorNameScope::Global()->Lookup(input_tensor);
}
CHECK_OR_RETURN(!input_lbn.empty()); // lbn must exist.

std::shared_ptr<Scope> scope = JUST(NewScopeWithParallelDescByTensor(input_tensor));
@@ -490,51 +544,6 @@ Maybe<void> LazyInterpreterApplyImplForSourceUserOpExpr(const UserOpExpr& op_exp
return Maybe<void>::Ok();
}

(Removed here: the original definition of AddFreeEagerTensorToVariableOp, identical to the block added near the top of this diff. The function was moved earlier in the file so that it is visible to the LazyInterpreter::ApplyImpl overload for FetchOutputOpExpr, which now calls it.)

Maybe<void> LazyInterpreterApplyImplForCopyUserOpExpr(const UserOpExpr& op_expr,
const TensorTuple& inputs,
TensorTuple* outputs,
@@ -683,6 +692,8 @@ Maybe<void> LazyInterpreter::ApplyImpl(const UserOpExpr& op_expr, const TensorTu
(*outputs)[i] =
JUST(BuildTensor(op_attr, obn, blob_parallel_desc, /* is_lazy= */ true, is_local));
} else {
VLOG(2) << "Lazy nn.Graph name " << graph_name << " op name " << new_op_name
<< " run with inplace.";
const std::shared_ptr<Tensor>& inplace_out = (*outputs)[i];
JUST(CheckTensorMatchAttr(inplace_out, op_attr, obn, blob_parallel_desc, is_local));
}
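The FetchOutputOpExpr change above is what lets a free eager tensor serve as a graph output: when the fetched tensor has no lbn yet, the interpreter registers it as a non-trainable variable op via AddFreeEagerTensorToVariableOp instead of failing the old is_lazy() check. A minimal sketch of the user-visible effect (mirroring the PR's own test further down; the class name is illustrative):

import numpy as np
import oneflow as flow

np_x = np.random.randn(2, 3)
# A free eager tensor, created outside any nn.Graph.
x = flow.tensor(np_x, dtype=flow.float32)

class ReturnEager(flow.nn.Graph):
    def build(self):
        # x has no lbn when it is fetched here, so the lazy interpreter now
        # adds a variable op for it and records that op's output lbn.
        return x

g = ReturnEager()
out = g()  # holds x's value; before this PR the is_lazy() CHECK failed here
assert np.allclose(out.numpy(), np_x, atol=1e-4)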
1 change: 0 additions & 1 deletion python/oneflow/framework/graph_build_util.py
@@ -184,7 +184,6 @@ def build_graph_state(op_name, state_tensor, state_config):

def build_graph_output(op_name, out):
assert isinstance(out, Tensor)
assert out.is_lazy

output_conf = (
oneflow._oneflow_internal.oneflow.core.operator.op_conf.FetchOutputOpConf()
63 changes: 63 additions & 0 deletions python/oneflow/test/graph/test_graph_free_eager_tensor.py
@@ -115,6 +115,69 @@ def build(self):
np.allclose(mul_out.numpy(), np_x * np_y, atol=1e-4, rtol=1e-4)
)

def test_graph_return_free_eager_tensor(test_case):
np_x = np.random.randn(5, 3)
x = flow.tensor(np_x, dtype=flow.float32)

class GraphReturnEager(flow.nn.Graph):
def __init__(self):
super().__init__()

def build(self):
# Return a free eager tensor
return x

g_return_eager = GraphReturnEager()

# Run first time
ret_eager_out = g_return_eager()
test_case.assertTrue(
np.allclose(ret_eager_out.numpy(), np_x, atol=1e-4, rtol=1e-4)
)

# Run second time
ret_eager_out1 = g_return_eager()
test_case.assertTrue(
np.allclose(ret_eager_out1.numpy(), np_x, atol=1e-4, rtol=1e-4)
)

def test_graph_return_inplace_free_eager_tensor(test_case):
np_x = np.random.randn(5, 3)
x = flow.tensor(np_x, dtype=flow.float32)

class GraphInplaceReturnEager(flow.nn.Graph):
def __init__(self):
super().__init__()

def build(self):
# x is a free eager tensor
# mul_ is an inplace scalar multiplication
# The input and output of mul_ are both tensor x
# After the lazy interpreter runs, tensor x's name is rebound to the output lbn of mul_
x.mul_(2)
# This returns the output of mul_
return x

g_return_eager = GraphInplaceReturnEager()

# Run first time
ret_eager_out = g_return_eager()
# The output reflects the change:
# nn.Graph simulates the inplace op inside nn.Graph.build().
test_case.assertTrue(
np.allclose(ret_eager_out.numpy(), np_x * 2, atol=1e-4, rtol=1e-4)
)
# x itself has not changed:
# inplace inside nn.Graph does not mutate the free eager tensor.
test_case.assertTrue(np.allclose(x.numpy(), np_x, atol=1e-4, rtol=1e-4))

# Run second time
ret_eager_out = g_return_eager()
test_case.assertTrue(
np.allclose(ret_eager_out.numpy(), np_x * 2, atol=1e-4, rtol=1e-4)
)
test_case.assertTrue(np.allclose(x.numpy(), np_x, atol=1e-4, rtol=1e-4))


@unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases")
@flow.unittest.skip_unless_1n2d()
2 changes: 1 addition & 1 deletion python/oneflow/test/modules/test_flatten.py
@@ -79,7 +79,7 @@ def test_flatten_module_with_random_data(test_case):
y = m(x)
return y

@autotest(check_graph=False)
@autotest(check_graph=True)
def test_flatten_with_random_data(test_case):
device = random_device()
x = random_pytorch_tensor().to(device)
6 changes: 5 additions & 1 deletion python/oneflow/test/tensor/test_parameter.py
@@ -40,7 +40,11 @@ def test_parameter_set_data_autograd_meta(test_case):
z.data = y
return z.grad_fn, z.is_leaf

@autotest(n=1, check_graph=False)
# Do not check graph for 2 reasons.
# Reason 1: x.data returns a new tensor that shares storage with the original tensor, which nn.Graph does not handle well yet.
# Reason 2: the inplace operation mul_ works inside nn.Graph but will not change the value of the free eager tensor.
# Please refer to the test case test_graph_return_inplace_free_eager_tensor.
@autotest(n=1, check_graph="ValidatedFlase")
def test_parameter_inplace_modify_data(test_case):
x = torch.nn.Parameter(torch.ones(2, 3))
x.data.mul_(2)
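To make the two reasons concrete, here is a short eager-mode sketch (not part of the PR) of the behavior the graph check cannot reproduce yet:

import oneflow as flow

x = flow.nn.Parameter(flow.ones(2, 3))
y = x.data    # a new tensor object, but it shares storage with x
y.mul_(2)     # eager mode: mutates the shared storage, so x sees the change
assert float(x.numpy().mean()) == 2.0

# Inside nn.Graph.build(), the same mul_ is rewritten as an out-of-place op
# whose output lbn is rebound to the tensor's name, so the free eager tensor's
# storage would stay all ones (see test_graph_return_inplace_free_eager_tensor
# above).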
@@ -335,7 +335,7 @@ def build(self, *args):
test_g = TestGraphOfModule()
if verbose:
print("Run graph of module: ", repr(oneflow))
test_g.debug(2)
test_g.debug(3)
test_g_res = test_g(*oneflow_args)
elif oneflow.__name__ in ignore_apis_list:
find_check_module_func = False
@@ -379,8 +379,29 @@ def build(self):
test_g_res = oneflow_res
else:
pass
if verbose:
print(
"Run graph of function: ",
repr(oneflow),
", graph check is intentionally skiped.",
)
elif oneflow.__name__ == "Parameter":
# nn.Graph does not deal with Parameter creation.
test_g_res = oneflow_res
if verbose:
print(
"Run graph of function: ",
repr(oneflow),
", graph check is intentionally skiped.",
)
else:
test_g = TestGraphOfFunctional()
if verbose:
print(
"Run graph of function: ",
repr(oneflow),
)
test_g.debug(3)
test_g_res = test_g()
except Exception as e:
print_note_fake_program()
@@ -439,6 +460,9 @@ def build(self):

try:
test_g = TestGraphOfTensorMethod()
if verbose:
print("Run graph of method: ", repr(oneflow))
test_g.debug(3)
test_g_res = test_g()
except Exception as e:
print_note_fake_program()
@@ -676,6 +700,10 @@ def autotest(
):
verbose = os.getenv("ONEFLOW_TEST_VERBOSE") is not None

if check_graph == "ValidatedFlase":
# The graph check is intentionally disabled and there is a validated reason.
check_graph = False

def deco(f):
@functools.wraps(f)
def new_f(test_case):
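For reference, the sentinel (its misspelling is the actual string used in the codebase) is passed where a plain boolean would go; test_parameter.py above uses it exactly like this:

# Usage sketch; the test body is abbreviated from the diff shown earlier.
@autotest(n=1, check_graph="ValidatedFlase")
def test_parameter_inplace_modify_data(test_case):
    x = torch.nn.Parameter(torch.ones(2, 3))
    x.data.mul_(2)
    ...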