From e30763538a2b050f165cc84a4c2a042a319a311e Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 30 Jan 2026 17:17:42 -0800
Subject: [PATCH 1/3] Update the cost estimator for latest task-spec and
 local-execution.

---
 .../cost_estimator/local_cost_estimator.h     |  34 ++--
 .../cost_estimator/tracked_allocator.h        |   4 -
 .../cost_estimator/local_cost_estimator.cc    | 163 +++++++++++-------
 .../cost_estimator/tracked_allocator.cc       |   6 +-
 .../local-execution/local_cost_estimator.cc   |  72 +++++---
 5 files changed, 179 insertions(+), 100 deletions(-)

diff --git a/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h b/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h
index d07a8b731b..93e28d0986 100644
--- a/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h
+++ b/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h
@@ -1,19 +1,23 @@
-#if 0 // FIXME (Elliott): fix cost estimator
-
 #ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H
 #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H
 
 #include "compiler/cost_estimator/cost_estimator.h"
+#include "kernels/allocation.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "kernels/profiling_settings.dtg.h"
+#include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_interconnect_specification.dtg.h"
-#include "pcg/optimizer_attrs.dtg.h"
-#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
 
 namespace FlexFlow {
 
 struct LocalCostEstimator : public ICostEstimator {
-  explicit LocalCostEstimator(RuntimeArgConfig const &,
-                              MachineInterconnectSpecification const &,
-                              DeviceType);
+  explicit LocalCostEstimator(MachineInterconnectSpecification const &,
+                              Allocator &allocator,
+                              ProfilingSettings const &profiling_settings,
+                              device_handle_t const &device_handle,
+                              FFIterationConfig const &iteration_config,
+                              device_id_t device_idx);
 
   LocalCostEstimator(LocalCostEstimator const &) = delete;
   LocalCostEstimator(LocalCostEstimator &&) = delete;
@@ -24,16 +28,22 @@ struct LocalCostEstimator : public ICostEstimator {
   milliseconds_t estimate_cost(TensorSetMovement const &) const override;
 
 private:
-  RuntimeArgConfig runtime_arg_config;
   MachineInterconnectSpecification interconnect_specification;
-  DeviceType device_type;
+  Allocator allocator;
+  ProfilingSettings profiling_settings;
+  device_handle_t device_handle;
+  FFIterationConfig iteration_config;
+  device_id_t device_idx;
 };
 CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCostEstimator);
 
-CostEstimator get_local_cost_estimator(RuntimeArgConfig const &);
+CostEstimator get_local_cost_estimator(MachineInterconnectSpecification const &,
+                                       Allocator &,
+                                       ProfilingSettings const &,
+                                       device_handle_t const &,
+                                       FFIterationConfig const &,
+                                       device_id_t);
 
 } // namespace FlexFlow
 
 #endif
-
-#endif
diff --git a/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h b/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h
index 79a62b628a..0b531f9b3d 100644
--- a/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h
+++ b/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h
@@ -1,5 +1,3 @@
-#if 0 // FIXME (Elliott): fix cost estimator
-
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H
 #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H
 
@@ -35,5 +33,3 @@ size_t get_tracked_memory_usage(Allocator &wrapped_allocator);
 } // namespace FlexFlow
 
 #endif
-
-#endif
diff --git a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
index 79e2dcafb2..b9cd0c238d 100644
--- a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
+++ b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc
@@ -1,60 +1,75 @@
-#if 0 // FIXME (Elliott): fix cost estimator
-
 #include "local-execution/cost_estimator/local_cost_estimator.h"
 #include "compiler/machine_mapping/machine_view.dtg.h"
 #include "kernels/create_local_allocator_for_device_type.h"
 #include "kernels/device.h"
 #include "kernels/local_cpu_allocator.h"
 #include "kernels/local_cuda_allocator.h"
+#include "local-execution/computation_graph_instance/computation_graph_instance.h"
 #include "local-execution/cost_estimator/tracked_allocator.h"
 #include "op-attrs/computation_graph_op_attrs.h"
 #include "op-attrs/pcg_operator_attrs.h"
+#include "op-attrs/tensor_slot_name.dtg.h"
 #include "pcg/computation_graph.h"
 #include "pcg/computation_graph/layer_added_result.dtg.h"
+#include "pcg/device_id.h"
 #include "pcg/parallel_tensor_attrs.h"
 #include "utils/containers/concat_vectors.h"
-#include "utils/containers/get_only.h"
+#include "utils/containers/map_values.h"
 #include "utils/containers/maximum.h"
+#include "utils/containers/require_only_key.h"
 #include "utils/containers/sum.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/unordered_set_of.h"
 #include "utils/containers/values.h"
+#include "utils/exception.h"
+#include "utils/optional.h"
+#include <optional>
 
 namespace FlexFlow {
 
-LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config)
-    : runtime_arg_config(config) {}
+LocalCostEstimator::LocalCostEstimator(
+    MachineInterconnectSpecification const &interconnect_specification,
+    Allocator &allocator, // NOTE(review): member copy looks unused; confirm
+    ProfilingSettings const &profiling_settings,
+    device_handle_t const &device_handle,
+    FFIterationConfig const &iteration_config,
+    device_id_t device_idx)
+    : interconnect_specification(interconnect_specification),
+      allocator(allocator), profiling_settings(profiling_settings),
+      device_handle(device_handle), iteration_config(iteration_config),
+      device_idx(device_idx) {} // each argument is copied into its member
 
 static ComputationGraph computation_graph_for_local_cost_estimation(
     ComputationGraphOpAttrs const &op,
-    std::vector<ParallelTensorShape> const &inputs,
-    std::vector<ParallelTensorShape> const &weights,
-    std::vector<ParallelTensorShape> const &outputs) {
+    std::unordered_map<TensorSlotName, ParallelTensorShape> const &inputs,
+    std::unordered_map<TensorSlotName, ParallelTensorShape> const &weights,
+    std::unordered_map<TensorSlotName, ParallelTensorShape> const &outputs) {
   ComputationGraph computation_graph = make_empty_computation_graph();
 
-  std::vector<tensor_guid_t> input_tensors;
-  for (ParallelTensorShape const &input : inputs) {
-    LayerAddedResult inputs_layer = add_layer(
-        computation_graph,
-        LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}},
-                   std::nullopt},
-        {},
-        {});
-    input_tensors.push_back(get_only(inputs_layer.outputs));
-  }
-
-  std::vector<tensor_guid_t> weight_tensors;
-  for (ParallelTensorShape const &weight : weights) {
-    LayerAddedResult weights_layer =
-        add_layer(computation_graph,
-                  LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
-                                 get_piece_shape(weight),
-                                 InitializerAttrs{ZeroInitializerAttrs{}}}},
-                             std::nullopt},
-                  {},
-                  {});
-    weight_tensors.push_back(get_only(weights_layer.outputs));
-  }
+  std::unordered_map<TensorSlotName, tensor_guid_t> input_tensors =
+      map_values(inputs, [&](ParallelTensorShape const &shape) {
+        LayerAddedResult inputs_layer =
+            add_layer(computation_graph,
+                      LayerAttrs{ComputationGraphOpAttrs{
+                                     InputAttrs{get_piece_shape(shape)}},
+                                 std::nullopt},
+                      {},
+                      {});
+        return require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
+      });
+
+  std::unordered_map<TensorSlotName, tensor_guid_t> weight_tensors =
+      map_values(weights, [&](ParallelTensorShape const &shape) {
+        LayerAddedResult weights_layer =
+            add_layer(computation_graph,
+                      LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                                     get_piece_shape(shape),
+                                     InitializerAttrs{ZeroInitializerAttrs{}}}},
+                                 std::nullopt},
+                      {},
+                      {});
+        return require_only_key(weights_layer.outputs, TensorSlotName::OUTPUT);
+      });
 
   // create operator layer
   LayerAddedResult operator_layer = add_layer(computation_graph,
@@ -72,10 +87,13 @@ OpCostMetrics LocalCostEstimator::estimate_cost(
     OpCostEstimateKey const &op_cost_estimate_key) const {
 
   PCGOperatorAttrs op = op_cost_estimate_key.op_attrs;
-  std::vector<ParallelTensorShape> inputs = op_cost_estimate_key.input_shapes;
-  std::vector<ParallelTensorShape> weights = op_cost_estimate_key.weight_shapes;
-  std::vector<ParallelTensorShape> outputs = op_cost_estimate_key.output_shapes;
-  MachineView mv = op_cost_estimate_key.machine_view;
+  std::unordered_map<TensorSlotName, ParallelTensorShape> inputs =
+      op_cost_estimate_key.input_shapes;
+  std::unordered_map<TensorSlotName, ParallelTensorShape> weights =
+      op_cost_estimate_key.weight_shapes;
+  std::unordered_map<TensorSlotName, ParallelTensorShape> outputs =
+      op_cost_estimate_key.output_shapes;
+  OptimizerAttrs optimizer_attrs = op_cost_estimate_key.optimizer_attrs;
 
   if (is_parallel_op(op) || op.has<InputAttrs>() || op.has<NoopAttrs>() ||
       op.has<WeightAttrs>()) {
@@ -89,30 +107,50 @@ OpCostMetrics LocalCostEstimator::estimate_cost(
   // allocate memory
   std::shared_ptr<TrackedAllocator> tracked_allocator_ptr =
       std::make_shared<TrackedAllocator>(create_local_allocator_for_device_type(
-          runtime_arg_config.kernel_device_type));
+          get_device_type(this->device_idx)));
 
   layer_guid_t layer_guid = layer_guid_t{Node{0}};
 
   Allocator allocator = Allocator(tracked_allocator_ptr);
 
+  ComputationGraph cg = computation_graph_for_local_cost_estimation(
+      /*op=*/assert_unwrap(compgraph_op_attrs_from_pcg_op_attrs(op)),
+      /*inputs=*/inputs,
+      /*weights=*/weights,
+      /*outputs=*/outputs);
+
+  ComputationGraphInstance instance = create_computation_graph_instance(
+      /*compgraph=*/cg,
+      /*optimizer_attrs=*/optimizer_attrs,
+      /*loss_attrs=*/std::nullopt,
+      /*label_tensor=*/std::nullopt,
+      /*logit_tensor=*/std::nullopt,
+      /*input_tensors=*/{},
+      /*allocator=*/allocator,
+      /*profiling_settings=*/this->profiling_settings,
+      /*device_handle=*/this->device_handle,
+      /*iteration_config=*/this->iteration_config,
+      /*device_idx=*/this->device_idx);
+
   // execute layer
-  layer_guid_t operator_layer_guid =
-      get_layer_by_name(training_cg.computation_graph, "operator");
-
-  milliseconds_t fwd = execute_forward(local_backing.local_task_registry,
-                                       local_backing.local_tensor_backing,
-                                       local_backing.local_args_backing,
-                                       get_training_layer_plus_context(
-                                           training_cg, operator_layer_guid),
-                                       allocator)
-                           .value();
-  milliseconds_t bwd = execute_backward(local_backing.local_task_registry,
-                                        local_backing.local_tensor_backing,
-                                        local_backing.local_args_backing,
-                                        get_training_layer_plus_context(
-                                            training_cg, operator_layer_guid),
-                                        allocator)
-                           .value();
+  dynamic_layer_guid_t operator_layer_guid{get_layer_by_name(cg, "operator")};
+
+  std::unordered_map<dynamic_layer_guid_t, std::optional<milliseconds_t>>
+      fwd_timing = perform_forward_pass_for_computation_graph_instance(
+          instance,
+          this->profiling_settings,
+          this->device_handle,
+          this->iteration_config,
+          this->device_idx);
+  milliseconds_t fwd = fwd_timing.at(operator_layer_guid).value();
+  std::unordered_map<dynamic_layer_guid_t, std::optional<milliseconds_t>>
+      bwd_timing = perform_backward_pass_for_computation_graph_instance(
+          instance,
+          this->profiling_settings,
+          this->device_handle,
+          this->iteration_config,
+          this->device_idx);
+  milliseconds_t bwd = bwd_timing.at(operator_layer_guid).value();
 
   return OpCostMetrics{
       /*forward_runtime=*/fwd,
@@ -123,7 +161,6 @@ OpCostMetrics LocalCostEstimator::estimate_cost(
 
 milliseconds_t LocalCostEstimator::estimate_cost(
     TensorSetMovement const &tensor_set_movement) const {
-
   auto estimate_single_comm_cost =
       [&](MachineSpaceCoordinate const &src,
           MachineSpaceCoordinate const &dst,
@@ -147,11 +184,19 @@ milliseconds_t LocalCostEstimator::estimate_cost(
                 }));
 }
 
-CostEstimator
-    get_local_cost_estimator(RuntimeArgConfig const &runtime_arg_config) {
-  return CostEstimator::create<LocalCostEstimator>(runtime_arg_config);
+CostEstimator get_local_cost_estimator(
+    MachineInterconnectSpecification const &interconnect_specification,
+    Allocator &allocator,
+    ProfilingSettings const &profiling_settings,
+    device_handle_t const &device_handle,
+    FFIterationConfig const &iteration_config,
+    device_id_t device_idx) { // thin factory: forwards every argument as-is
+  return CostEstimator::create<LocalCostEstimator>(interconnect_specification,
+                                                   allocator,
+                                                   profiling_settings,
+                                                   device_handle,
+                                                   iteration_config,
+                                                   device_idx);
 }
 
 } // namespace FlexFlow
-
-#endif
diff --git a/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc b/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc
index 2930ba0c86..8424f637c6 100644
--- a/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc
+++ b/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc
@@ -1,6 +1,4 @@
-#if 0 // FIXME (Elliott): fix cost estimator
-
-#include "local-execution/tracked_allocator.h"
+#include "local-execution/cost_estimator/tracked_allocator.h"
 #include "kernels/device.h"
 
 namespace FlexFlow {
@@ -35,5 +33,3 @@ Allocator get_tracked_memory_allocator(Allocator const &base_allocator) {
 }
 
 } // namespace FlexFlow
-
-#endif
diff --git a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc
index 788817d3ed..f3dcab7f82 100644
--- a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc
+++ b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc
@@ -1,24 +1,26 @@
-#if 0 // FIXME (Elliott): fix cost estimator
 #include "local-execution/cost_estimator/local_cost_estimator.h"
 #include "compiler/machine_mapping/machine_view.h"
-#include "internal/test_utils.h"
 #include "kernels/device_handle_t.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/local_cuda_allocator.h"
+#include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "op-attrs/ops/attention.h"
 #include "op-attrs/ops/cast.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_slot_name.dtg.h"
 #include "pcg/computation_graph_builder.h"
-#include "task-spec/runtime_task_invocation/runtime_arg_config.h"
+#include "pcg/device_id_t.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("LocalCostEstimator") {
-    RuntimeArgConfig runtime_arg_config =
-        cpu_make_runtime_arg_config(EnableProfiling::YES,
-                                    ProfilingSettings{/*warmup_iters=*/0,
-                                                      /*measure_iters=*/1});
+    Allocator allocator = create_local_cpu_memory_allocator();
+    device_handle_t ff_handle = cpu_make_device_handle_t();
+    device_id_t device_idx =
+        make_device_id_t_from_idx(nonnegative_int{0}, DeviceType::CPU);
 
     OptimizerAttrs optimizer_attrs = OptimizerAttrs{
         SGDOptimizerAttrs{
@@ -29,7 +31,20 @@ TEST_SUITE(FF_TEST_SUITE) {
         },
     };
 
-    CostEstimator cost_estimator = get_local_cost_estimator(runtime_arg_config);
+    MachineInterconnectSpecification interconnect_specification{
+        /*inter_node_bandwidth=*/bytes_per_second_t{10000000},
+        /*intra_node_bandwidth=*/bytes_per_second_t{10000000000},
+    };
+
+    CostEstimator cost_estimator = get_local_cost_estimator(
+        /*interconnect_specification=*/interconnect_specification,
+        /*allocator=*/allocator,
+        /*profiling_settings=*/
+        ProfilingSettings{/*warmup_iters=*/0,
+                          /*measure_iters=*/1},
+        /*device_handle=*/ff_handle,
+        /*iteration_config=*/FFIterationConfig{1_p},
+        /*device_idx=*/device_idx);
 
     SUBCASE("estimate operator cost") {
       CastAttrs attrs = CastAttrs{
@@ -46,9 +61,9 @@ TEST_SUITE(FF_TEST_SUITE) {
 
       OpCostEstimateKey op_cost_estimate_key = OpCostEstimateKey{
           /*op_attrs=*/PCGOperatorAttrs{attrs},
-          /*input_shapes=*/{input_shape},
+          /*input_shapes=*/{{TensorSlotName::INPUT, input_shape}},
           /*weight_shapes=*/{},
-          /*output_shapes=*/{output_shape},
+          /*output_shapes=*/{{TensorSlotName::OUTPUT, output_shape}},
           /*optimizer_attrs=*/optimizer_attrs,
           /*machine_view=*/
           make_1d_machine_view(
@@ -68,15 +83,17 @@ TEST_SUITE(FF_TEST_SUITE) {
 
 TEST_SUITE(FF_CUDA_TEST_SUITE) {
   TEST_CASE("LocalCostEstimator (CUDA)") {
+    ManagedFFStream managed_stream{};
     ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
         /*workSpaceSize=*/1024 * 1024,
         /*allowTensorOpMathConversion=*/true);
 
-    RuntimeArgConfig runtime_arg_config =
-        gpu_make_runtime_arg_config(managed_handle.raw_handle(),
-                                    EnableProfiling::YES,
-                                    ProfilingSettings{/*warmup_iters=*/0,
-                                                      /*measure_iters=*/1});
+    Allocator allocator = create_local_cuda_memory_allocator();
+
+    device_id_t device_idx =
+        make_device_id_t_from_idx(nonnegative_int{0}, DeviceType::GPU);
+    device_handle_t ff_handle =
+        gpu_make_device_handle_t(managed_handle.raw_handle());
 
     OptimizerAttrs optimizer_attrs = OptimizerAttrs{
         SGDOptimizerAttrs{
@@ -87,7 +104,20 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         },
     };
 
-    CostEstimator cost_estimator = get_local_cost_estimator(runtime_arg_config);
+    MachineInterconnectSpecification interconnect_specification{
+        /*inter_node_bandwidth=*/bytes_per_second_t{10000000},
+        /*intra_node_bandwidth=*/bytes_per_second_t{10000000000},
+    };
+
+    CostEstimator cost_estimator = get_local_cost_estimator(
+        /*interconnect_specification=*/interconnect_specification,
+        /*allocator=*/allocator,
+        /*profiling_settings=*/
+        ProfilingSettings{/*warmup_iters=*/0,
+                          /*measure_iters=*/1},
+        /*device_handle=*/ff_handle,
+        /*iteration_config=*/FFIterationConfig{1_p},
+        /*device_idx=*/device_idx);
 
     SUBCASE("estimate operator cost") {
       positive_int embed_dim = 32_p;
@@ -122,9 +152,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
       OpCostEstimateKey op_cost_estimate_key = OpCostEstimateKey{
           /*op_attrs=*/PCGOperatorAttrs{attrs},
-          /*input_shapes=*/{inputs_shape, inputs_shape, inputs_shape},
-          /*weight_shapes=*/{weights_shape},
-          /*output_shapes=*/{output_shape},
+          /*input_shapes=*/
+          {{TensorSlotName::QUERY, inputs_shape},
+           {TensorSlotName::KEY, inputs_shape},
+           {TensorSlotName::VALUE, inputs_shape}},
+          /*weight_shapes=*/{{TensorSlotName::WEIGHT, weights_shape}},
+          /*output_shapes=*/{{TensorSlotName::OUTPUT, output_shape}},
           /*optimizer_attrs=*/optimizer_attrs,
           /*machine_view=*/
           make_1d_machine_view(
@@ -141,4 +174,3 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     }
   }
 }
-#endif

From 5038fb32f16b1d6707479d8fb5924490b8218992 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Fri, 6 Feb 2026 14:58:34 -0800
Subject: [PATCH 2/3] Implement get_tensor_shape interface.

---
 .../local_task_argument_accessor.h            |  2 ++
 .../local_task_argument_accessor.cc           | 31 +++++++++++++++++++
 .../itask_argument_accessor.h                 |  2 ++
 .../task_argument_accessor.h                  |  2 +-
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h
index 44844a67f1..638bea247e 100644
--- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h
+++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h
@@ -27,6 +27,8 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor {
   LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete;
   LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete;
 
+  TensorShape get_tensor_shape(TensorSlotName slot) const override;
+
   GenericTensorAccessor get_tensor(TaskTensorParameter slot,
                                    Permissions priv) const override;
 
diff --git a/lib/local-execution/src/local-execution/local_task_argument_accessor.cc b/lib/local-execution/src/local-execution/local_task_argument_accessor.cc
index 8a4df61d17..796d122a23 100644
--- a/lib/local-execution/src/local-execution/local_task_argument_accessor.cc
+++ b/lib/local-execution/src/local-execution/local_task_argument_accessor.cc
@@ -4,6 +4,7 @@
 #include "pcg/device_id_t.h"
 #include "utils/exception.h"
 #include "utils/optional.h"
+#include "utils/overload.h"
 
 namespace FlexFlow {
 
@@ -26,6 +27,36 @@ LocalTaskArgumentAccessor::LocalTaskArgumentAccessor(
       iteration_config(iteration_config), optimizer_attrs(optimizer_attrs),
       device_idx(device_idx) {}
 
+// Returns the shape of the first tensor_slots_backing entry bound to `slot`.
+// Forward, gradient and optimizer parameters match on their name; a loss
+// parameter never matches. PANICs if no entry matches.
+TensorShape
+    LocalTaskArgumentAccessor::get_tensor_shape(TensorSlotName slot) const {
+
+  for (auto const &[parameter, tensor] : this->tensor_slots_backing) {
+    bool names_slot = parameter.visit<bool>(overload{
+        [&](TaskForwardTensorParameter const &p) { return p.name == slot; },
+        [&](TaskGradientTensorParameter const &p) { return p.name == slot; },
+        [&](TaskOptimizerTensorParameter const &p) { return p.name == slot; },
+        [&](TaskLossTensorParameter const &) { return false; },
+    });
+
+    if (!names_slot) {
+      continue;
+    }
+
+    // Anything that is not a read-only accessor is read as a writable one;
+    // the shape is taken from whichever alternative is held.
+    if (tensor.has<GenericTensorAccessorR>()) {
+      return tensor.get<GenericTensorAccessorR>().shape;
+    }
+    return tensor.get<GenericTensorAccessorW>().shape;
+  }
+
+  PANIC("Unable to find TensorSlotName in tensor_slots_backing",
+        fmt::to_string(slot));
+}
+
 GenericTensorAccessor
     LocalTaskArgumentAccessor::get_tensor(TaskTensorParameter slot,
                                           Permissions priv) const {
diff --git a/lib/task-spec/include/task-spec/task_argument_accessor/itask_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor/itask_argument_accessor.h
index a7d1af4022..3d08101915 100644
--- a/lib/task-spec/include/task-spec/task_argument_accessor/itask_argument_accessor.h
+++ b/lib/task-spec/include/task-spec/task_argument_accessor/itask_argument_accessor.h
@@ -24,6 +24,8 @@ struct ITaskArgumentAccessor {
 
   virtual ~ITaskArgumentAccessor() = default;
 
+  virtual TensorShape get_tensor_shape(TensorSlotName) const = 0;
+
   virtual GenericTensorAccessor get_tensor(TaskTensorParameter,
                                            Permissions priv) const = 0;
 
diff --git a/lib/task-spec/include/task-spec/task_argument_accessor/task_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor/task_argument_accessor.h
index e350387684..29f3f625f6 100644
--- a/lib/task-spec/include/task-spec/task_argument_accessor/task_argument_accessor.h
+++ b/lib/task-spec/include/task-spec/task_argument_accessor/task_argument_accessor.h
@@ -27,7 +27,7 @@ struct TaskArgumentAccessor {
   OptimizerAttrs get_optimizer_attrs() const;
 
   TensorShape get_tensor_shape(TensorSlotName slot) const {
-    NOT_IMPLEMENTED();
+    return this->ptr->get_tensor_shape(slot);
   }
 
   template <Permissions PRIV>

From 59f6f2499592041dfb829f97ea0b057af1bb3d94 Mon Sep 17 00:00:00 2001
From: Elliott Slaughter <slaughter@cs.stanford.edu>
Date: Thu, 12 Feb 2026 09:13:08 -0800
Subject: [PATCH 3/3] Add back arg names to interface.

---
 .../cost_estimator/local_cost_estimator.h     | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h b/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h
index 93e28d0986..653067da8a 100644
--- a/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h
+++ b/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h
@@ -12,12 +12,13 @@
 namespace FlexFlow {
 
 struct LocalCostEstimator : public ICostEstimator {
-  explicit LocalCostEstimator(MachineInterconnectSpecification const &,
-                              Allocator &allocator,
-                              ProfilingSettings const &profiling_settings,
-                              device_handle_t const &device_handle,
-                              FFIterationConfig const &iteration_config,
-                              device_id_t device_idx);
+  explicit LocalCostEstimator(
+      MachineInterconnectSpecification const &interconnect_specification,
+      Allocator &allocator,
+      ProfilingSettings const &profiling_settings,
+      device_handle_t const &device_handle,
+      FFIterationConfig const &iteration_config,
+      device_id_t device_idx);
 
   LocalCostEstimator(LocalCostEstimator const &) = delete;
   LocalCostEstimator(LocalCostEstimator &&) = delete;
@@ -37,12 +38,13 @@ struct LocalCostEstimator : public ICostEstimator {
 };
 CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCostEstimator);
 
-CostEstimator get_local_cost_estimator(MachineInterconnectSpecification const &,
-                                       Allocator &,
-                                       ProfilingSettings const &,
-                                       device_handle_t const &,
-                                       FFIterationConfig const &,
-                                       device_id_t);
+CostEstimator get_local_cost_estimator(
+    MachineInterconnectSpecification const &interconnect_specification,
+    Allocator &allocator,
+    ProfilingSettings const &profiling_settings,
+    device_handle_t const &device_handle,
+    FFIterationConfig const &iteration_config,
+    device_id_t device_idx);
 
 } // namespace FlexFlow