Skip to content

Commit

Permalink
Passing tests after merge issues
Browse files Browse the repository at this point in the history
  • Loading branch information
reyna-abhyankar committed Feb 24, 2025
1 parent 9047edc commit 350babf
Show file tree
Hide file tree
Showing 12 changed files with 247 additions and 200 deletions.
5 changes: 2 additions & 3 deletions lib/local-execution/src/allocated_tensors.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ bool are_allocated_gradient_tensors_valid(
for (std::pair<tensor_guid_t, gradient_tensor_t> const &tensor_to_grad :
allocated_tensors.gradient_mapping) {
if (tensor_attrs.count(tensor_to_grad.first)) {
if (tensor_attrs.at(tensor_to_grad.first).create_gradients ==
CreateGrad::NO) {
if (tensor_attrs.at(tensor_to_grad.first).create_grad == CreateGrad::NO) {
return false;
}

Expand Down Expand Up @@ -96,7 +95,7 @@ bool are_allocated_optimizer_tensors_valid(
for (std::pair<tensor_guid_t, std::vector<optimizer_tensor_t>> const
&tensor_to_optimizers : allocated_tensors.optimizer_mapping) {
if (tensor_attrs.count(tensor_to_optimizers.first)) {
if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients ==
if (tensor_attrs.at(tensor_to_optimizers.first).create_grad ==
CreateGrad::NO) {
return false;
}
Expand Down
59 changes: 27 additions & 32 deletions lib/local-execution/src/local_cost_estimator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
#include "local-execution/tracked_allocator.h"
#include "op-attrs/computation_graph_op_attrs.h"
#include "op-attrs/pcg_operator_attrs.h"
#include "pcg/computation_graph/layer_added_result.dtg.h"
#include "pcg/computation_graph.h"
#include "pcg/computation_graph/layer_added_result.dtg.h"
#include "pcg/machine_view.dtg.h"
#include "pcg/parallel_tensor_attrs.h"
#include "utils/containers/concat_vectors.h"
#include "utils/containers/get_only.h"
#include "utils/containers/sum.h"
#include "pcg/parallel_tensor_attrs.h"
#include "utils/containers/transform.h"
#include "utils/containers/values.h"

Expand All @@ -26,41 +26,36 @@ static ComputationGraph create_computation_graph_for_local_cost_estimation(
std::vector<ParallelTensorAttrs> const &outputs) {
ComputationGraph computation_graph = make_empty_computation_graph();

Check warning on line 27 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L27

Added line #L27 was not covered by tests

// create layer for inputs
auto get_vector_piece_attrs_from_parallel_tensor_shape =
[](std::vector<ParallelTensorShape> const &parallel_shapes) {
return transform(parallel_shapes, [](ParallelTensorShape const &p) {
return TensorAttrs{
get_piece_shape(p), std::nullopt, std::nullopt, CreateGrad::YES};
});
};

LayerAddedResult inputs_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"},
{},
get_vector_piece_attrs_from_parallel_tensor_shape(inputs));

// create layer for weights
auto get_vector_piece_attrs_from_parallel_tensor_attrs =
[](std::vector<ParallelTensorAttrs> const &parallel_attrs) {
return transform(parallel_attrs, [](ParallelTensorAttrs const &p) {
return get_piece_attrs(p);
});
};

LayerAddedResult weights_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "weights"},
{},
get_vector_piece_attrs_from_parallel_tensor_attrs(weights));
std::vector<tensor_guid_t> input_tensors;
for (ParallelTensorShape const &input : inputs) {

Check warning on line 30 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L29-L30

Added lines #L29 - L30 were not covered by tests
LayerAddedResult inputs_layer = add_layer(
computation_graph,
LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}},

Check warning on line 33 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L33

Added line #L33 was not covered by tests
std::nullopt},
{},
{});
input_tensors.push_back(get_only(inputs_layer.outputs));
}

Check warning on line 38 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L36-L38

Added lines #L36 - L38 were not covered by tests

std::vector<tensor_guid_t> weight_tensors;
for (ParallelTensorAttrs const &weight : weights) {

Check warning on line 41 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L40-L41

Added lines #L40 - L41 were not covered by tests
LayerAddedResult weights_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
get_piece_shape(weight.shape),
InitializerAttrs{ZeroInitializerAttrs{}}}},

Check warning on line 46 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L44-L46

Added lines #L44 - L46 were not covered by tests
std::nullopt},
{},
{});
weight_tensors.push_back(get_only(weights_layer.outputs));
}

Check warning on line 51 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L49-L51

Added lines #L49 - L51 were not covered by tests

// create operator layer
LayerAddedResult operator_layer = add_layer(
computation_graph,
LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"},

Check warning on line 56 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L56

Added line #L56 was not covered by tests
concat_vectors(inputs_layer.outputs, weights_layer.outputs),
get_vector_piece_attrs_from_parallel_tensor_attrs(outputs));
input_tensors,
weight_tensors);

Check warning on line 58 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L58

Added line #L58 was not covered by tests

return computation_graph;
}

Check warning on line 61 in lib/local-execution/src/local_cost_estimator.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_cost_estimator.cc#L60-L61

Added lines #L60 - L61 were not covered by tests
Expand Down
2 changes: 1 addition & 1 deletion lib/local-execution/src/local_training_backing.cc
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ void execute_update(LocalTrainingBacking const &local_training_backing,
Allocator &allocator) {
LayerAttrs layer_attrs =
get_layer_attrs(local_training_backing.computation_graph, node);
if (layer_attrs.attrs.has<WeightAttrs>()) {
if (layer_attrs.op_attrs.has<WeightAttrs>()) {

Check warning on line 216 in lib/local-execution/src/local_training_backing.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_training_backing.cc#L215-L216

Added lines #L215 - L216 were not covered by tests
// get tensors
tensor_guid_t weight_tensor = get_only(
get_outgoing_tensors(local_training_backing.computation_graph, node));

Check warning on line 219 in lib/local-execution/src/local_training_backing.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/local_training_backing.cc#L218-L219

Added lines #L218 - L219 were not covered by tests
Expand Down
224 changes: 134 additions & 90 deletions lib/local-execution/src/optimizer.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "local-execution/optimizer.h"
#include "kernels/optimizer_kernels.h"
#include "task-spec/profiling.h"
#include "utils/containers/get_only.h"
#include "utils/overload.h"

namespace FlexFlow {
Expand All @@ -24,9 +25,12 @@ TaskSignature get_sgd_update_signature() {

add_arg_slot<SGDOptimizerAttrs>(sig, ATTRS);
add_arg_slot<ProfilingSettings>(sig, PROFILING);
if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
}
add_unchecked_arg_slot<PerDeviceFFHandle>(

Check warning on line 28 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L26-L28

Added lines #L26 - L28 were not covered by tests
sig, HANDLE); // how to deal with removal of ParamSync?

// if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
// add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
// }
return sig;
}

Check warning on line 35 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L34-L35

Added lines #L34 - L35 were not covered by tests

Expand All @@ -44,12 +48,16 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
b.bind_arg(ATTRS, attrs);
b.bind_arg(PROFILING, profiling_settings());

Check warning on line 49 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L48-L49

Added lines #L48 - L49 were not covered by tests

if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
b.bind_arg(HANDLE, ff_handle());
return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b};
} else {
return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b};
}
b.bind_arg(HANDLE, ff_handle());
return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID,
b}; // how to deal with removal of ParamSync?

Check warning on line 53 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L51-L53

Added lines #L51 - L53 were not covered by tests

// if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
// b.bind_arg(HANDLE, ff_handle());
// return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b};
// } else {
// return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b};
// }
}

Check warning on line 61 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L61

Added line #L61 was not covered by tests

static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
Expand All @@ -73,35 +81,49 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
sgd_v_ptr = sgd_v.get_float_ptr();

Check warning on line 81 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L78-L81

Added lines #L78 - L81 were not covered by tests
}

if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
profile(sgd_nccl_update_task_gpu,
profiling,
"[SGD NCCL] update_time = %.2lfms\n",
attrs.lr,
attrs.momentum,
attrs.nesterov,
attrs.weight_decay,
handle,
weight_grad.get_float_ptr(),
size,
weight.get_float_ptr(),
sgd_v_ptr);

} else {
profile(sgd_ps_update_task_gpu,
profiling,
"[SGD PS] update_time = %.2lfms\n",
attrs.lr,
attrs.momentum,
attrs.nesterov,
attrs.weight_decay,
weight_grad.get_float_ptr(),
size,
num_replicas,
weight.get_float_ptr(),
sgd_v_ptr);
}
auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
profile(sgd_nccl_update_task_gpu,

Check warning on line 85 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L84-L85

Added lines #L84 - L85 were not covered by tests
profiling,
"[SGD NCCL] update_time = %.2lfms\n",
attrs.lr,
attrs.momentum,
attrs.nesterov,
attrs.weight_decay,
handle,
weight_grad.get_float_ptr(),

Check warning on line 93 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L93

Added line #L93 was not covered by tests
size,
weight.get_float_ptr(),

Check warning on line 95 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L95

Added line #L95 was not covered by tests
sgd_v_ptr); // how to deal with removal of ParamSync?

// if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
// auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
// profile(sgd_nccl_update_task_gpu,
// profiling,
// "[SGD NCCL] update_time = %.2lfms\n",
// attrs.lr,
// attrs.momentum,
// attrs.nesterov,
// attrs.weight_decay,
// handle,
// weight_grad.get_float_ptr(),
// size,
// weight.get_float_ptr(),
// sgd_v_ptr);

// } else {
// profile(sgd_ps_update_task_gpu,
// profiling,
// "[SGD PS] update_time = %.2lfms\n",
// attrs.lr,
// attrs.momentum,
// attrs.nesterov,
// attrs.weight_decay,
// weight_grad.get_float_ptr(),
// size,
// num_replicas,
// weight.get_float_ptr(),
// sgd_v_ptr);
// }
}

Check warning on line 127 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L127

Added line #L127 was not covered by tests

TaskImplFunction get_sgd_update_task_impl() {
Expand All @@ -117,9 +139,11 @@ TaskSignature get_adam_update_signature() {

add_arg_slot<AdamOptimizerAttrs>(sig, ATTRS);
add_arg_slot<ProfilingSettings>(sig, PROFILING);
if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
}
add_unchecked_arg_slot<PerDeviceFFHandle>(

Check warning on line 142 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L140-L142

Added lines #L140 - L142 were not covered by tests
sig, HANDLE); // how to deal with removal of ParamSync?
// if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
// add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
// }
return sig;
}

Check warning on line 148 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L147-L148

Added lines #L147 - L148 were not covered by tests

Expand All @@ -135,13 +159,16 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
b.bind_optimizer(ADAM_V, adam_v);
b.bind_arg(ATTRS, attrs);
b.bind_arg(PROFILING, profiling_settings());
b.bind_arg(HANDLE, ff_handle());
return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID,
b}; // how to deal with removal of ParamSync?

Check warning on line 164 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L155-L164

Added lines #L155 - L164 were not covered by tests

if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
b.bind_arg(HANDLE, ff_handle());
return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
} else {
return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b};
}
// if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
// b.bind_arg(HANDLE, ff_handle());
// return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
// } else {
// return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b};
// }
}

Check warning on line 172 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L172

Added line #L172 was not covered by tests

static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
Expand All @@ -162,38 +189,54 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() /
weight.shape.get_volume().unwrap_nonnegative();

Check warning on line 190 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L189-L190

Added lines #L189 - L190 were not covered by tests

if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
profile(adam_nccl_update_task_gpu,
profiling,
"[Adam NCCL] update_time = %.2lfms\n",
attrs.alpha_t,
attrs.beta1,
attrs.beta2,
attrs.weight_decay,
attrs.epsilon,
size,
handle,
weight_grad.get_float_ptr(),
m_tensor.get_float_ptr(),
v_tensor.get_float_ptr(),
weight.get_float_ptr());
} else {
profile(adam_ps_update_task_gpu,
profiling,
"[Adam NCCL] update_time = %.2lfms\n",
attrs.alpha_t,
attrs.beta1,
attrs.beta2,
attrs.weight_decay,
attrs.epsilon,
size,
num_replicas,
weight_grad.get_float_ptr(),
m_tensor.get_float_ptr(),
v_tensor.get_float_ptr(),
weight.get_float_ptr());
}
auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
profile(adam_nccl_update_task_gpu,

Check warning on line 193 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L192-L193

Added lines #L192 - L193 were not covered by tests
profiling,
"[Adam NCCL] update_time = %.2lfms\n",
attrs.alpha_t,
attrs.beta1,
attrs.beta2,
attrs.weight_decay,
attrs.epsilon,
size,
handle,
weight_grad.get_float_ptr(),
m_tensor.get_float_ptr(),
v_tensor.get_float_ptr(),
weight.get_float_ptr()); // how to deal with removal of ParamSync?

Check warning on line 206 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L203-L206

Added lines #L203 - L206 were not covered by tests

// if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
// auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
// profile(adam_nccl_update_task_gpu,
// profiling,
// "[Adam NCCL] update_time = %.2lfms\n",
// attrs.alpha_t,
// attrs.beta1,
// attrs.beta2,
// attrs.weight_decay,
// attrs.epsilon,
// size,
// handle,
// weight_grad.get_float_ptr(),
// m_tensor.get_float_ptr(),
// v_tensor.get_float_ptr(),
// weight.get_float_ptr());
// } else {
// profile(adam_ps_update_task_gpu,
// profiling,
// "[Adam NCCL] update_time = %.2lfms\n",
// attrs.alpha_t,
// attrs.beta1,
// attrs.beta2,
// attrs.weight_decay,
// attrs.epsilon,
// size,
// num_replicas,
// weight_grad.get_float_ptr(),
// m_tensor.get_float_ptr(),
// v_tensor.get_float_ptr(),
// weight.get_float_ptr());
// }
}

Check warning on line 240 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L240

Added line #L240 was not covered by tests

TaskImplFunction get_adam_update_task_impl() {
Expand All @@ -211,17 +254,18 @@ TaskInvocation get_update_invocation(
tensor_guid_t const &weight,
gradient_tensor_t const &weight_grad,
std::vector<optimizer_tensor_t> const &grad_buffer_tensors) {
return attrs.visit<TaskInvocation>(overload{
[&](SGDOptimizerAttrs const &s) {
return sgd_update(s, weight, weight_grad, grad_buffer_tensors.at(0));
},
[&](AdamOptimizerAttrs const &s) {
return adam_update(s,
weight,
weight_grad,
grad_buffer_tensors.at(0),
grad_buffer_tensors.at(1));
}});
return attrs.visit<TaskInvocation>(
overload{[&](SGDOptimizerAttrs const &s) {

Check warning on line 258 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L258

Added line #L258 was not covered by tests
return sgd_update(
s, weight, weight_grad, get_only(grad_buffer_tensors));

Check warning on line 260 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L260

Added line #L260 was not covered by tests
},
[&](AdamOptimizerAttrs const &s) {

Check warning on line 262 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L262

Added line #L262 was not covered by tests
return adam_update(s,
weight,
weight_grad,
grad_buffer_tensors.at(0),
grad_buffer_tensors.at(1));
}});

Check warning on line 268 in lib/local-execution/src/optimizer.cc

View check run for this annotation

Codecov / codecov/patch

lib/local-execution/src/optimizer.cc#L264-L268

Added lines #L264 - L268 were not covered by tests
}

TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) {
Expand Down
Loading

0 comments on commit 350babf

Please sign in to comment.