
Commit

bug fix for before_idx
FeixLiu committed Aug 11, 2021
1 parent 1b12a43 commit abbae2e
Showing 1 changed file with 52 additions and 37 deletions.
@@ -240,25 +240,8 @@ def _allreduce_fusion_program(self):
                 continue
             param_grads.append((param, grad))
 
-        # Each item of outputs_name_to_idx is a pair of idx.
-        # The first entry of the pair is the idx of the first op that generates the grad,
-        # which is used to indicate the position to insert the coalesce op.
-        # The second entry of the pair is the idx of the last op that generates the grad,
-        # which is used to indicate the position to insert the sync and allreduce ops.
-        outputs_name_to_idx = {}
-        for idx in range(first_backward_idx, len(block.ops)):
-            op = block.ops[idx]
-            if is_optimizer_op(op):
-                break
-            for name in op.output_arg_names:
-                var = block.var(name)
-                if not outputs_name_to_idx.get(var):
-                    # if the grad is generated by only one op,
-                    # the first idx and the last idx are identical
-                    outputs_name_to_idx[var] = (idx, idx)
-                else:
-                    outputs_name_to_idx[var] = (outputs_name_to_idx[var][0],
-                                                idx)
+        outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx,
+                                                            block)
 
         # structure of grad_param_segments is
         # [([grad0, grad1], [param0, param1]), ([grad2, grad3], [param2, param3])]
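
For orientation, here is a small, self-contained sketch of the grad_param_segments layout described in the comment above. It is a hypothetical illustration, not the real segmentation code: fuse_size is an assumed capacity, and plain strings stand in for the framework's Variable objects.

# Hypothetical sketch of how param/grad pairs are grouped into segments.
param_grads = [("param0", "grad0"), ("param1", "grad1"),
               ("param2", "grad2"), ("param3", "grad3")]
fuse_size = 2  # assumed segment capacity for this sketch

grad_param_segments = []
for param, grad in param_grads:
    if not grad_param_segments or len(grad_param_segments[-1][0]) == fuse_size:
        grad_param_segments.append(([grad], [param]))  # start a new segment
    else:
        grad_param_segments[-1][0].append(grad)
        grad_param_segments[-1][1].append(param)

# [(['grad0', 'grad1'], ['param0', 'param1']),
#  (['grad2', 'grad3'], ['param2', 'param3'])]
print(grad_param_segments)
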
@@ -280,6 +263,7 @@ def _allreduce_fusion_program(self):
         if len(grad_param_segments) == 0:
             return
 
+        fused_vars = [None] * len(grad_param_segments)
         for i in range(len(grad_param_segments) - 1, -1, -1):
             # traverse the grad_param_segments backwards;
             # reversed() is not used since the absolute index value is needed
@@ -291,25 +275,10 @@ def _allreduce_fusion_program(self):
                 dtype=grad_segment[0].dtype,
                 persistable=False,
                 stop_gradient=True)
-            before_idx = outputs_name_to_idx[grad_segment[0]][0]
+            fused_vars[i] = fused_var
             after_idx = outputs_name_to_idx[grad_segment[-1]][1]
-            offset = 1
-            for j in range(i + 1, len(grad_param_segments)):
-                # Find the offset for the sync and allreduce ops.
-                # Some ops may have multiple grad_param pairs, and these grads might be
-                # split into different segments. If the last grad in this segment and
-                # the first grad in the next segment come from the same op, it means
-                # a coalesce op has already been inserted before this op.
-                # Therefore, we have to insert the sync/allreduce op with an offset.
-                # The j is to get the ([grad0, grad1], [param0, param1]) tuple.
-                # The first 0 is to get the [grad0, grad1] list.
-                # The second 0 is to get the grad0 entry.
-                # The 1 is to get the idx of the last op that generates the grad.
-                if after_idx == outputs_name_to_idx[grad_param_segments[j][0][
-                        0]][1]:
-                    offset += 1
             block._insert_op_without_sync(
-                after_idx + offset,
+                after_idx + 1,
                 type='c_allreduce_sum',
                 inputs={'X': fused_var},
                 outputs={'Out': fused_var},
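
Why traverse the segments back to front? Each insertion shifts every op index after the insertion point by one, so handling the largest after_idx first leaves the smaller, not-yet-used indices intact (assuming segments appear in program order). A toy sketch of that invariant, with a plain list standing in for block.ops and made-up producer indices:

# Toy model: insert an allreduce marker after the last producer of each
# segment, walking segments from back to front.
ops = ["producer_g0", "producer_g1", "producer_g2", "producer_g3"]
after_idx_per_segment = [1, 3]  # assumed last-producer index per segment

for seg in range(len(after_idx_per_segment) - 1, -1, -1):
    after_idx = after_idx_per_segment[seg]
    # inserting at after_idx + 1 only shifts indices greater than after_idx,
    # so the earlier segment's after_idx is still valid afterwards
    ops.insert(after_idx + 1, "allreduce_segment%d" % seg)

# ['producer_g0', 'producer_g1', 'allreduce_segment0',
#  'producer_g2', 'producer_g3', 'allreduce_segment1']
print(ops)
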
@@ -320,11 +289,35 @@ def _allreduce_fusion_program(self):
                 })
             if not self.calc_comm_same_stream:
                 block._insert_op_without_sync(
-                    after_idx + offset,
+                    after_idx + 1,
                     type='c_sync_calc_stream',
                     inputs={'X': fused_var},
                     outputs={'Out': fused_var},
                     attrs={OP_ROLE_KEY: OpRole.Backward})
+
+        # update the outputs_name_to_idx after the insertion of the sync/allreduce ops
+        outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx,
+                                                            block)
+        # the before_idx values are not guaranteed to be sorted, so sort them
+        # before inserting the coalesce ops to keep the topology valid
+        pos_for_coalesce = {}
+        for i in range(len(grad_param_segments) - 1, -1, -1):
+            # The insertion of the coalesce ops is separated from the insertion of
+            # the sync/allreduce ops, since inserting a coalesce op would invalidate
+            # the indices recorded in outputs_name_to_idx
+            grad_segment, param_segment = grad_param_segments[i]
+            before_idx = len(block.ops)
+            for grad in grad_segment:
+                before_idx = min(before_idx, outputs_name_to_idx[grad][0])
+            pos_for_coalesce[i] = before_idx
+
+        # insert the coalesce ops based on the sorted before_idx
+        pos_for_coalesce = sorted(
+            pos_for_coalesce.items(),
+            key=lambda kv: (kv[1], kv[0]),
+            reverse=True)
+        for i, before_idx in pos_for_coalesce:
+            grad_segment, param_segment = grad_param_segments[i]
+            fused_var = fused_vars[i]
             block._insert_op_without_sync(
                 before_idx,
                 type="coalesce_tensor",
@@ -354,3 +347,25 @@ def _allreduce_fusion_program(self):
                            OP_ROLE_KEY: OpRole.Backward})
                 break
         block._sync_with_cpp()
+
+    def __get_ouputs_name_to_idx(self, first_backward_idx, block):
+        # Each item of outputs_name_to_idx is a pair of idx.
+        # The first entry of the pair is the idx of the first op that generates the grad,
+        # which is used to indicate the position to insert the coalesce op.
+        # The second entry of the pair is the idx of the last op that generates the grad,
+        # which is used to indicate the position to insert the sync and allreduce ops.
+        outputs_name_to_idx = {}
+        for idx in range(first_backward_idx, len(block.ops)):
+            op = block.ops[idx]
+            if is_optimizer_op(op):
+                break
+            for name in op.output_arg_names:
+                var = block.var(name)
+                if not outputs_name_to_idx.get(var):
+                    # if the grad is generated by only one op,
+                    # the first idx and the last idx are identical
+                    outputs_name_to_idx[var] = (idx, idx)
+                else:
+                    outputs_name_to_idx[var] = (outputs_name_to_idx[var][0],
+                                                idx)
+        return outputs_name_to_idx
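
To make the (first_idx, last_idx) bookkeeping concrete, here is a self-contained sketch of the same scan over a made-up op list. The Op class and the output names are stand-ins for the block and op objects the real helper walks; only the first/last producer-index logic is reproduced.

# Illustration of the first/last producer-index scan above.
class Op:
    def __init__(self, *outputs):
        self.output_arg_names = list(outputs)

ops = [Op("grad_a"), Op("grad_b"), Op("grad_a"), Op("grad_c")]

outputs_name_to_idx = {}
for idx, op in enumerate(ops):
    for name in op.output_arg_names:
        if name not in outputs_name_to_idx:
            # first and last producer coincide until the name shows up again
            outputs_name_to_idx[name] = (idx, idx)
        else:
            outputs_name_to_idx[name] = (outputs_name_to_idx[name][0], idx)

# {'grad_a': (0, 2), 'grad_b': (1, 1), 'grad_c': (3, 3)}
print(outputs_name_to_idx)
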

1 comment on commit abbae2e

@paddle-bot-old

Congratulations! Your pull request passed all required CI. You could ask reviewer(s) to approve and merge. 🎉
