Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
#if 0 // FIXME (Elliott): fix cost estimator

#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H
#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H

#include "compiler/cost_estimator/cost_estimator.h"
#include "kernels/allocation.h"
#include "kernels/device_handle_t.dtg.h"
#include "kernels/profiling_settings.dtg.h"
#include "pcg/device_id_t.dtg.h"
#include "pcg/machine_interconnect_specification.dtg.h"
#include "pcg/optimizer_attrs.dtg.h"
#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h"
#include "task-spec/ff_iteration_config.dtg.h"

namespace FlexFlow {

struct LocalCostEstimator : public ICostEstimator {
explicit LocalCostEstimator(RuntimeArgConfig const &,
MachineInterconnectSpecification const &,
DeviceType);
explicit LocalCostEstimator(
MachineInterconnectSpecification const &interconnect_specification,
Allocator &allocator,
ProfilingSettings const &profiling_settings,
device_handle_t const &device_handle,
FFIterationConfig const &iteration_config,
device_id_t device_idx);

LocalCostEstimator(LocalCostEstimator const &) = delete;
LocalCostEstimator(LocalCostEstimator &&) = delete;
Expand All @@ -24,16 +29,23 @@ struct LocalCostEstimator : public ICostEstimator {
milliseconds_t estimate_cost(TensorSetMovement const &) const override;

private:
RuntimeArgConfig runtime_arg_config;
MachineInterconnectSpecification interconnect_specification;
DeviceType device_type;
Allocator allocator;
ProfilingSettings profiling_settings;
device_handle_t device_handle;
FFIterationConfig iteration_config;
device_id_t device_idx;
};
CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCostEstimator);

CostEstimator get_local_cost_estimator(RuntimeArgConfig const &);
CostEstimator get_local_cost_estimator(
MachineInterconnectSpecification const &interconnect_specification,
Allocator &allocator,
ProfilingSettings const &profiling_settings,
device_handle_t const &device_handle,
FFIterationConfig const &iteration_config,
device_id_t device_idx);

} // namespace FlexFlow

#endif

#endif
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#if 0 // FIXME (Elliott): fix cost estimator

#ifndef _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H
#define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H

Expand Down Expand Up @@ -35,5 +33,3 @@ size_t get_tracked_memory_usage(Allocator &wrapped_allocator);
} // namespace FlexFlow

#endif

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor {
LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete;
LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete;

TensorShape get_tensor_shape(TensorSlotName slot) const override;

GenericTensorAccessor get_tensor(TaskTensorParameter slot,
Permissions priv) const override;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,60 +1,75 @@
#if 0 // FIXME (Elliott): fix cost estimator

#include "local-execution/cost_estimator/local_cost_estimator.h"
#include "compiler/machine_mapping/machine_view.dtg.h"
#include "kernels/create_local_allocator_for_device_type.h"
#include "kernels/device.h"
#include "kernels/local_cpu_allocator.h"
#include "kernels/local_cuda_allocator.h"
#include "local-execution/computation_graph_instance/computation_graph_instance.h"
#include "local-execution/cost_estimator/tracked_allocator.h"
#include "op-attrs/computation_graph_op_attrs.h"
#include "op-attrs/pcg_operator_attrs.h"
#include "op-attrs/tensor_slot_name.dtg.h"
#include "pcg/computation_graph.h"
#include "pcg/computation_graph/layer_added_result.dtg.h"
#include "pcg/device_id.h"
#include "pcg/parallel_tensor_attrs.h"
#include "utils/containers/concat_vectors.h"
#include "utils/containers/get_only.h"
#include "utils/containers/map_values.h"
#include "utils/containers/maximum.h"
#include "utils/containers/require_only_key.h"
#include "utils/containers/sum.h"
#include "utils/containers/transform.h"
#include "utils/containers/unordered_set_of.h"
#include "utils/containers/values.h"
#include "utils/exception.h"
#include "utils/optional.h"
#include <optional>

namespace FlexFlow {

LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config)
: runtime_arg_config(config) {}
// Constructs a cost estimator that profiles operator kernels on one local
// device, copying the per-device execution configuration it will reuse for
// every estimate_cost() call.
//
// Parameters:
//   interconnect_specification - machine interconnect description used for
//       communication-cost estimates.
//   allocator                  - allocator used for tensor buffers during
//       profiling (copied into the member by value).
//   profiling_settings         - settings forwarded to the profiling passes.
//   device_handle              - handle for the device kernels run on.
//   iteration_config           - iteration configuration forwarded to the
//       forward/backward profiling passes.
//   device_idx                 - id of the local device to profile on.
//
// NOTE(review): `allocator` is taken by non-const reference but stored by
// value, so the member is an independent copy — presumably Allocator has
// shared-handle copy semantics; confirm against kernels/allocation.h.
LocalCostEstimator::LocalCostEstimator(
MachineInterconnectSpecification const &interconnect_specification,
Allocator &allocator,
ProfilingSettings const &profiling_settings,
device_handle_t const &device_handle,
FFIterationConfig const &iteration_config,
device_id_t device_idx)
: interconnect_specification(interconnect_specification),
allocator(allocator), profiling_settings(profiling_settings),
device_handle(device_handle), iteration_config(iteration_config),
device_idx(device_idx) {}

static ComputationGraph computation_graph_for_local_cost_estimation(
ComputationGraphOpAttrs const &op,
std::vector<ParallelTensorShape> const &inputs,
std::vector<ParallelTensorShape> const &weights,
std::vector<ParallelTensorShape> const &outputs) {
std::unordered_map<TensorSlotName, ParallelTensorShape> const &inputs,
std::unordered_map<TensorSlotName, ParallelTensorShape> const &weights,
std::unordered_map<TensorSlotName, ParallelTensorShape> const &outputs) {
ComputationGraph computation_graph = make_empty_computation_graph();

std::vector<tensor_guid_t> input_tensors;
for (ParallelTensorShape const &input : inputs) {
LayerAddedResult inputs_layer = add_layer(
computation_graph,
LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}},
std::nullopt},
{},
{});
input_tensors.push_back(get_only(inputs_layer.outputs));
}

std::vector<tensor_guid_t> weight_tensors;
for (ParallelTensorShape const &weight : weights) {
LayerAddedResult weights_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
get_piece_shape(weight),
InitializerAttrs{ZeroInitializerAttrs{}}}},
std::nullopt},
{},
{});
weight_tensors.push_back(get_only(weights_layer.outputs));
}
std::unordered_map<TensorSlotName, tensor_guid_t> input_tensors =
map_values(inputs, [&](ParallelTensorShape const &shape) {
LayerAddedResult inputs_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{
InputAttrs{get_piece_shape(shape)}},
std::nullopt},
{},
{});
return require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
});

std::unordered_map<TensorSlotName, tensor_guid_t> weight_tensors =
map_values(weights, [&](ParallelTensorShape const &shape) {
LayerAddedResult weights_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
get_piece_shape(shape),
InitializerAttrs{ZeroInitializerAttrs{}}}},
std::nullopt},
{},
{});
return require_only_key(weights_layer.outputs, TensorSlotName::OUTPUT);
});

// create operator layer
LayerAddedResult operator_layer = add_layer(computation_graph,
Expand All @@ -72,10 +87,13 @@ OpCostMetrics LocalCostEstimator::estimate_cost(
OpCostEstimateKey const &op_cost_estimate_key) const {

PCGOperatorAttrs op = op_cost_estimate_key.op_attrs;
std::vector<ParallelTensorShape> inputs = op_cost_estimate_key.input_shapes;
std::vector<ParallelTensorShape> weights = op_cost_estimate_key.weight_shapes;
std::vector<ParallelTensorShape> outputs = op_cost_estimate_key.output_shapes;
MachineView mv = op_cost_estimate_key.machine_view;
std::unordered_map<TensorSlotName, ParallelTensorShape> inputs =
op_cost_estimate_key.input_shapes;
std::unordered_map<TensorSlotName, ParallelTensorShape> weights =
op_cost_estimate_key.weight_shapes;
std::unordered_map<TensorSlotName, ParallelTensorShape> outputs =
op_cost_estimate_key.output_shapes;
OptimizerAttrs optimizer_attrs = op_cost_estimate_key.optimizer_attrs;

if (is_parallel_op(op) || op.has<InputAttrs>() || op.has<NoopAttrs>() ||
op.has<WeightAttrs>()) {
Expand All @@ -89,30 +107,50 @@ OpCostMetrics LocalCostEstimator::estimate_cost(
// allocate memory
std::shared_ptr<TrackedAllocator> tracked_allocator_ptr =
std::make_shared<TrackedAllocator>(create_local_allocator_for_device_type(
runtime_arg_config.kernel_device_type));
get_device_type(this->device_idx)));

layer_guid_t layer_guid = layer_guid_t{Node{0}};

Allocator allocator = Allocator(tracked_allocator_ptr);

ComputationGraph cg = computation_graph_for_local_cost_estimation(
/*op=*/assert_unwrap(compgraph_op_attrs_from_pcg_op_attrs(op)),
/*inputs=*/inputs,
/*weights=*/weights,
/*outputs=*/outputs);

ComputationGraphInstance instance = create_computation_graph_instance(
/*compgraph=*/cg,
/*optimizer_attrs=*/optimizer_attrs,
/*loss_attrs=*/std::nullopt,
/*label_tensor=*/std::nullopt,
/*logit_tensor=*/std::nullopt,
/*input_tensors=*/{},
/*allocator=*/allocator,
/*profiling_settings=*/this->profiling_settings,
/*device_handle=*/this->device_handle,
/*iteration_config=*/this->iteration_config,
/*device_idx=*/this->device_idx);

// execute layer
layer_guid_t operator_layer_guid =
get_layer_by_name(training_cg.computation_graph, "operator");

milliseconds_t fwd = execute_forward(local_backing.local_task_registry,
local_backing.local_tensor_backing,
local_backing.local_args_backing,
get_training_layer_plus_context(
training_cg, operator_layer_guid),
allocator)
.value();
milliseconds_t bwd = execute_backward(local_backing.local_task_registry,
local_backing.local_tensor_backing,
local_backing.local_args_backing,
get_training_layer_plus_context(
training_cg, operator_layer_guid),
allocator)
.value();
dynamic_layer_guid_t operator_layer_guid{get_layer_by_name(cg, "operator")};

std::unordered_map<dynamic_layer_guid_t, std::optional<milliseconds_t>>
fwd_timing = perform_forward_pass_for_computation_graph_instance(
instance,
this->profiling_settings,
this->device_handle,
this->iteration_config,
this->device_idx);
milliseconds_t fwd = fwd_timing.at(operator_layer_guid).value();
std::unordered_map<dynamic_layer_guid_t, std::optional<milliseconds_t>>
bwd_timing = perform_backward_pass_for_computation_graph_instance(
instance,
this->profiling_settings,
this->device_handle,
this->iteration_config,
this->device_idx);
milliseconds_t bwd = bwd_timing.at(operator_layer_guid).value();

return OpCostMetrics{
/*forward_runtime=*/fwd,
Expand All @@ -123,7 +161,6 @@ OpCostMetrics LocalCostEstimator::estimate_cost(

milliseconds_t LocalCostEstimator::estimate_cost(
TensorSetMovement const &tensor_set_movement) const {

auto estimate_single_comm_cost =
[&](MachineSpaceCoordinate const &src,
MachineSpaceCoordinate const &dst,
Expand All @@ -147,11 +184,19 @@ milliseconds_t LocalCostEstimator::estimate_cost(
}));
}

CostEstimator
get_local_cost_estimator(RuntimeArgConfig const &runtime_arg_config) {
return CostEstimator::create<LocalCostEstimator>(runtime_arg_config);
// Factory helper: wraps a LocalCostEstimator in the type-erased CostEstimator
// interface. All arguments are forwarded unchanged to the
// LocalCostEstimator constructor.
CostEstimator get_local_cost_estimator(
    MachineInterconnectSpecification const &interconnect_specification,
    Allocator &allocator,
    ProfilingSettings const &profiling_settings,
    device_handle_t const &device_handle,
    FFIterationConfig const &iteration_config,
    device_id_t device_idx) {
  return CostEstimator::create<LocalCostEstimator>(
      interconnect_specification, allocator, profiling_settings,
      device_handle, iteration_config, device_idx);
}

} // namespace FlexFlow

#endif
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
#if 0 // FIXME (Elliott): fix cost estimator

#include "local-execution/tracked_allocator.h"
#include "local-execution/cost_estimator/tracked_allocator.h"
#include "kernels/device.h"

namespace FlexFlow {
Expand Down Expand Up @@ -35,5 +33,3 @@ Allocator get_tracked_memory_allocator(Allocator const &base_allocator) {
}

} // namespace FlexFlow

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "pcg/device_id_t.h"
#include "utils/exception.h"
#include "utils/optional.h"
#include "utils/overload.h"

namespace FlexFlow {

Expand All @@ -26,6 +27,36 @@ LocalTaskArgumentAccessor::LocalTaskArgumentAccessor(
iteration_config(iteration_config), optimizer_attrs(optimizer_attrs),
device_idx(device_idx) {}

// Looks up the shape of the tensor bound to `slot` by scanning the backing
// map. Forward, gradient, and optimizer tensor parameters match by name;
// loss tensor parameters are never matched by a TensorSlotName. Panics if no
// backing entry matches the slot.
TensorShape
LocalTaskArgumentAccessor::get_tensor_shape(TensorSlotName slot) const {
  for (auto const &[backing_slot, accessor] : this->tensor_slots_backing) {
    bool const name_matches = backing_slot.visit<bool>(overload{
        [&](TaskForwardTensorParameter const &p) { return p.name == slot; },
        [&](TaskGradientTensorParameter const &p) { return p.name == slot; },
        [&](TaskOptimizerTensorParameter const &p) { return p.name == slot; },
        [&](TaskLossTensorParameter const &) { return false; },
    });

    if (!name_matches) {
      continue;
    }

    // Read the shape from whichever accessor variant is stored.
    if (accessor.has<GenericTensorAccessorR>()) {
      return accessor.get<GenericTensorAccessorR>().shape;
    }
    return accessor.get<GenericTensorAccessorW>().shape;
  }

  PANIC("Unable to find TensorSlotName in tensor_slots_backing",
        fmt::to_string(slot));
}

GenericTensorAccessor
LocalTaskArgumentAccessor::get_tensor(TaskTensorParameter slot,
Permissions priv) const {
Expand Down
Loading
Loading