Passing tests after merge issues

flexflow · Feb 24, 2025 · 350babf · 350babf
1 parent 9047edc
commit 350babf
Show file tree

Hide file tree

Showing 12 changed files with 247 additions and 200 deletions.
diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc
@@ -54,8 +54,7 @@ bool are_allocated_gradient_tensors_valid(
   for (std::pair<tensor_guid_t, gradient_tensor_t> const &tensor_to_grad :
        allocated_tensors.gradient_mapping) {
     if (tensor_attrs.count(tensor_to_grad.first)) {
-      if (tensor_attrs.at(tensor_to_grad.first).create_gradients ==
-          CreateGrad::NO) {
+      if (tensor_attrs.at(tensor_to_grad.first).create_grad == CreateGrad::NO) {
         return false;
       }
 
@@ -96,7 +95,7 @@ bool are_allocated_optimizer_tensors_valid(
   for (std::pair<tensor_guid_t, std::vector<optimizer_tensor_t>> const
            &tensor_to_optimizers : allocated_tensors.optimizer_mapping) {
     if (tensor_attrs.count(tensor_to_optimizers.first)) {
-      if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients ==
+      if (tensor_attrs.at(tensor_to_optimizers.first).create_grad ==
           CreateGrad::NO) {
         return false;
       }

diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
@@ -4,13 +4,13 @@
 #include "local-execution/tracked_allocator.h"
 #include "op-attrs/computation_graph_op_attrs.h"
 #include "op-attrs/pcg_operator_attrs.h"
-#include "pcg/computation_graph/layer_added_result.dtg.h"
 #include "pcg/computation_graph.h"
+#include "pcg/computation_graph/layer_added_result.dtg.h"
 #include "pcg/machine_view.dtg.h"
 #include "pcg/parallel_tensor_attrs.h"
 #include "utils/containers/concat_vectors.h"
+#include "utils/containers/get_only.h"
 #include "utils/containers/sum.h"
-#include "pcg/parallel_tensor_attrs.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/values.h"
 
@@ -26,41 +26,36 @@ static ComputationGraph create_computation_graph_for_local_cost_estimation(
     std::vector<ParallelTensorAttrs> const &outputs) {
   ComputationGraph computation_graph = make_empty_computation_graph();
 
-  // create layer for inputs
-  auto get_vector_piece_attrs_from_parallel_tensor_shape =
-      [](std::vector<ParallelTensorShape> const &parallel_shapes) {
-        return transform(parallel_shapes, [](ParallelTensorShape const &p) {
-          return TensorAttrs{
-              get_piece_shape(p), std::nullopt, std::nullopt, CreateGrad::YES};
-        });
-      };
-
-  LayerAddedResult inputs_layer =
-      add_layer(computation_graph,
-                LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"},
-                {},
-                get_vector_piece_attrs_from_parallel_tensor_shape(inputs));
-
-  // create layer for weights
-  auto get_vector_piece_attrs_from_parallel_tensor_attrs =
-      [](std::vector<ParallelTensorAttrs> const &parallel_attrs) {
-        return transform(parallel_attrs, [](ParallelTensorAttrs const &p) {
-          return get_piece_attrs(p);
-        });
-      };
-
-  LayerAddedResult weights_layer =
-      add_layer(computation_graph,
-                LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "weights"},
-                {},
-                get_vector_piece_attrs_from_parallel_tensor_attrs(weights));
+  std::vector<tensor_guid_t> input_tensors;
+  for (ParallelTensorShape const &input : inputs) {
+    LayerAddedResult inputs_layer = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}},
+                   std::nullopt},
+        {},
+        {});
+    input_tensors.push_back(get_only(inputs_layer.outputs));
+  }
+
+  std::vector<tensor_guid_t> weight_tensors;
+  for (ParallelTensorAttrs const &weight : weights) {
+    LayerAddedResult weights_layer =
+        add_layer(computation_graph,
+                  LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                                 get_piece_shape(weight.shape),
+                                 InitializerAttrs{ZeroInitializerAttrs{}}}},
+                             std::nullopt},
+                  {},
+                  {});
+    weight_tensors.push_back(get_only(weights_layer.outputs));
+  }
 
   // create operator layer
   LayerAddedResult operator_layer = add_layer(
       computation_graph,
       LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"},
-      concat_vectors(inputs_layer.outputs, weights_layer.outputs),
-      get_vector_piece_attrs_from_parallel_tensor_attrs(outputs));
+      input_tensors,
+      weight_tensors);
 
   return computation_graph;
 }

diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
@@ -213,7 +213,7 @@ void execute_update(LocalTrainingBacking const &local_training_backing,
                     Allocator &allocator) {
   LayerAttrs layer_attrs =
       get_layer_attrs(local_training_backing.computation_graph, node);
-  if (layer_attrs.attrs.has<WeightAttrs>()) {
+  if (layer_attrs.op_attrs.has<WeightAttrs>()) {
     // get tensors
     tensor_guid_t weight_tensor = get_only(
         get_outgoing_tensors(local_training_backing.computation_graph, node));

diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
@@ -1,6 +1,7 @@
 #include "local-execution/optimizer.h"
 #include "kernels/optimizer_kernels.h"
 #include "task-spec/profiling.h"
+#include "utils/containers/get_only.h"
 #include "utils/overload.h"
 
 namespace FlexFlow {
@@ -24,9 +25,12 @@ TaskSignature get_sgd_update_signature() {
 
   add_arg_slot<SGDOptimizerAttrs>(sig, ATTRS);
   add_arg_slot<ProfilingSettings>(sig, PROFILING);
-  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
-    add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
-  }
+  add_unchecked_arg_slot<PerDeviceFFHandle>(
+      sig, HANDLE); // how to deal with removal of ParamSync?
+
+  // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+  //   add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
+  // }
   return sig;
 }
 
@@ -44,12 +48,16 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
 
-  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
-    b.bind_arg(HANDLE, ff_handle());
-    return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b};
-  } else {
-    return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b};
-  }
+  b.bind_arg(HANDLE, ff_handle());
+  return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID,
+                        b}; // how to deal with removal of ParamSync?
+
+  // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+  //   b.bind_arg(HANDLE, ff_handle());
+  //   return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b};
+  // } else {
+  //   return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b};
+  // }
 }
 
 static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
@@ -73,35 +81,49 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
     sgd_v_ptr = sgd_v.get_float_ptr();
   }
 
-  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
-    auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-    profile(sgd_nccl_update_task_gpu,
-            profiling,
-            "[SGD NCCL] update_time = %.2lfms\n",
-            attrs.lr,
-            attrs.momentum,
-            attrs.nesterov,
-            attrs.weight_decay,
-            handle,
-            weight_grad.get_float_ptr(),
-            size,
-            weight.get_float_ptr(),
-            sgd_v_ptr);
-
-  } else {
-    profile(sgd_ps_update_task_gpu,
-            profiling,
-            "[SGD PS] update_time = %.2lfms\n",
-            attrs.lr,
-            attrs.momentum,
-            attrs.nesterov,
-            attrs.weight_decay,
-            weight_grad.get_float_ptr(),
-            size,
-            num_replicas,
-            weight.get_float_ptr(),
-            sgd_v_ptr);
-  }
+  auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  profile(sgd_nccl_update_task_gpu,
+          profiling,
+          "[SGD NCCL] update_time = %.2lfms\n",
+          attrs.lr,
+          attrs.momentum,
+          attrs.nesterov,
+          attrs.weight_decay,
+          handle,
+          weight_grad.get_float_ptr(),
+          size,
+          weight.get_float_ptr(),
+          sgd_v_ptr); // how to deal with removal of ParamSync?
+
+  // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+  //   auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  //   profile(sgd_nccl_update_task_gpu,
+  //           profiling,
+  //           "[SGD NCCL] update_time = %.2lfms\n",
+  //           attrs.lr,
+  //           attrs.momentum,
+  //           attrs.nesterov,
+  //           attrs.weight_decay,
+  //           handle,
+  //           weight_grad.get_float_ptr(),
+  //           size,
+  //           weight.get_float_ptr(),
+  //           sgd_v_ptr);
+
+  // } else {
+  //   profile(sgd_ps_update_task_gpu,
+  //           profiling,
+  //           "[SGD PS] update_time = %.2lfms\n",
+  //           attrs.lr,
+  //           attrs.momentum,
+  //           attrs.nesterov,
+  //           attrs.weight_decay,
+  //           weight_grad.get_float_ptr(),
+  //           size,
+  //           num_replicas,
+  //           weight.get_float_ptr(),
+  //           sgd_v_ptr);
+  // }
 }
 
 TaskImplFunction get_sgd_update_task_impl() {
@@ -117,9 +139,11 @@ TaskSignature get_adam_update_signature() {
 
   add_arg_slot<AdamOptimizerAttrs>(sig, ATTRS);
   add_arg_slot<ProfilingSettings>(sig, PROFILING);
-  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
-    add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
-  }
+  add_unchecked_arg_slot<PerDeviceFFHandle>(
+      sig, HANDLE); // how to deal with removal of ParamSync?
+  // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+  //   add_unchecked_arg_slot<PerDeviceFFHandle>(sig, HANDLE);
+  // }
   return sig;
 }
 
@@ -135,13 +159,16 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
   b.bind_optimizer(ADAM_V, adam_v);
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
+  b.bind_arg(HANDLE, ff_handle());
+  return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID,
+                        b}; // how to deal with removal of ParamSync?
 
-  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
-    b.bind_arg(HANDLE, ff_handle());
-    return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
-  } else {
-    return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b};
-  }
+  // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+  //   b.bind_arg(HANDLE, ff_handle());
+  //   return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+  // } else {
+  //   return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b};
+  // }
 }
 
 static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
@@ -162,38 +189,54 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
   int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() /
                      weight.shape.get_volume().unwrap_nonnegative();
 
-  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
-    auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-    profile(adam_nccl_update_task_gpu,
-            profiling,
-            "[Adam NCCL] update_time = %.2lfms\n",
-            attrs.alpha_t,
-            attrs.beta1,
-            attrs.beta2,
-            attrs.weight_decay,
-            attrs.epsilon,
-            size,
-            handle,
-            weight_grad.get_float_ptr(),
-            m_tensor.get_float_ptr(),
-            v_tensor.get_float_ptr(),
-            weight.get_float_ptr());
-  } else {
-    profile(adam_ps_update_task_gpu,
-            profiling,
-            "[Adam NCCL] update_time = %.2lfms\n",
-            attrs.alpha_t,
-            attrs.beta1,
-            attrs.beta2,
-            attrs.weight_decay,
-            attrs.epsilon,
-            size,
-            num_replicas,
-            weight_grad.get_float_ptr(),
-            m_tensor.get_float_ptr(),
-            v_tensor.get_float_ptr(),
-            weight.get_float_ptr());
-  }
+  auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  profile(adam_nccl_update_task_gpu,
+          profiling,
+          "[Adam NCCL] update_time = %.2lfms\n",
+          attrs.alpha_t,
+          attrs.beta1,
+          attrs.beta2,
+          attrs.weight_decay,
+          attrs.epsilon,
+          size,
+          handle,
+          weight_grad.get_float_ptr(),
+          m_tensor.get_float_ptr(),
+          v_tensor.get_float_ptr(),
+          weight.get_float_ptr()); // how to deal with removal of ParamSync?
+
+  // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+  //   auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  //   profile(adam_nccl_update_task_gpu,
+  //           profiling,
+  //           "[Adam NCCL] update_time = %.2lfms\n",
+  //           attrs.alpha_t,
+  //           attrs.beta1,
+  //           attrs.beta2,
+  //           attrs.weight_decay,
+  //           attrs.epsilon,
+  //           size,
+  //           handle,
+  //           weight_grad.get_float_ptr(),
+  //           m_tensor.get_float_ptr(),
+  //           v_tensor.get_float_ptr(),
+  //           weight.get_float_ptr());
+  // } else {
+  //   profile(adam_ps_update_task_gpu,
+  //           profiling,
+  //           "[Adam NCCL] update_time = %.2lfms\n",
+  //           attrs.alpha_t,
+  //           attrs.beta1,
+  //           attrs.beta2,
+  //           attrs.weight_decay,
+  //           attrs.epsilon,
+  //           size,
+  //           num_replicas,
+  //           weight_grad.get_float_ptr(),
+  //           m_tensor.get_float_ptr(),
+  //           v_tensor.get_float_ptr(),
+  //           weight.get_float_ptr());
+  // }
 }
 
 TaskImplFunction get_adam_update_task_impl() {
@@ -211,17 +254,18 @@ TaskInvocation get_update_invocation(
     tensor_guid_t const &weight,
     gradient_tensor_t const &weight_grad,
     std::vector<optimizer_tensor_t> const &grad_buffer_tensors) {
-  return attrs.visit<TaskInvocation>(overload{
-      [&](SGDOptimizerAttrs const &s) {
-        return sgd_update(s, weight, weight_grad, grad_buffer_tensors.at(0));
-      },
-      [&](AdamOptimizerAttrs const &s) {
-        return adam_update(s,
-                           weight,
-                           weight_grad,
-                           grad_buffer_tensors.at(0),
-                           grad_buffer_tensors.at(1));
-      }});
+  return attrs.visit<TaskInvocation>(
+      overload{[&](SGDOptimizerAttrs const &s) {
+                 return sgd_update(
+                     s, weight, weight_grad, get_only(grad_buffer_tensors));
+               },
+               [&](AdamOptimizerAttrs const &s) {
+                 return adam_update(s,
+                                    weight,
+                                    weight_grad,
+                                    grad_buffer_tensors.at(0),
+                                    grad_buffer_tensors.at(1));
+               }});
 }
 
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) {