Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
#if 0 // FIXME (Elliott): fix cost estimator

#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H
#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H

#include "compiler/cost_estimator/cost_estimator.h"
#include "kernels/allocation.h"
#include "kernels/device_handle_t.dtg.h"
#include "kernels/profiling_settings.dtg.h"
#include "pcg/device_id_t.dtg.h"
#include "pcg/machine_interconnect_specification.dtg.h"
#include "pcg/optimizer_attrs.dtg.h"
#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h"
#include "task-spec/ff_iteration_config.dtg.h"

namespace FlexFlow {

struct LocalCostEstimator : public ICostEstimator {
explicit LocalCostEstimator(RuntimeArgConfig const &,
MachineInterconnectSpecification const &,
DeviceType);
explicit LocalCostEstimator(
MachineInterconnectSpecification const &interconnect_specification,
Allocator &allocator,
ProfilingSettings const &profiling_settings,
device_handle_t const &device_handle,
FFIterationConfig const &iteration_config,
device_id_t device_idx);

LocalCostEstimator(LocalCostEstimator const &) = delete;
LocalCostEstimator(LocalCostEstimator &&) = delete;
Expand All @@ -24,16 +29,23 @@ struct LocalCostEstimator : public ICostEstimator {
milliseconds_t estimate_cost(TensorSetMovement const &) const override;

private:
RuntimeArgConfig runtime_arg_config;
MachineInterconnectSpecification interconnect_specification;
DeviceType device_type;
Allocator allocator;
ProfilingSettings profiling_settings;
device_handle_t device_handle;
FFIterationConfig iteration_config;
device_id_t device_idx;
};
CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCostEstimator);

CostEstimator get_local_cost_estimator(RuntimeArgConfig const &);
CostEstimator get_local_cost_estimator(
MachineInterconnectSpecification const &interconnect_specification,
Allocator &allocator,
ProfilingSettings const &profiling_settings,
device_handle_t const &device_handle,
FFIterationConfig const &iteration_config,
device_id_t device_idx);

} // namespace FlexFlow

#endif

#endif
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#if 0 // FIXME (Elliott): fix cost estimator

#ifndef _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H
#define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H

Expand Down Expand Up @@ -35,5 +33,3 @@ size_t get_tracked_memory_usage(Allocator &wrapped_allocator);
} // namespace FlexFlow

#endif

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor {
LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete;
LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete;

TensorShape get_tensor_shape(TensorSlotName slot) const override;

GenericTensorAccessor get_tensor(TaskTensorParameter slot,
Permissions priv) const override;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,60 +1,75 @@
#if 0 // FIXME (Elliott): fix cost estimator

#include "local-execution/cost_estimator/local_cost_estimator.h"
#include "compiler/machine_mapping/machine_view.dtg.h"
#include "kernels/create_local_allocator_for_device_type.h"
#include "kernels/device.h"
#include "kernels/local_cpu_allocator.h"
#include "kernels/local_cuda_allocator.h"
#include "local-execution/computation_graph_instance/computation_graph_instance.h"
#include "local-execution/cost_estimator/tracked_allocator.h"
#include "op-attrs/computation_graph_op_attrs.h"
#include "op-attrs/pcg_operator_attrs.h"
#include "op-attrs/tensor_slot_name.dtg.h"
#include "pcg/computation_graph.h"
#include "pcg/computation_graph/layer_added_result.dtg.h"
#include "pcg/device_id.h"
#include "pcg/parallel_tensor_attrs.h"
#include "utils/containers/concat_vectors.h"
#include "utils/containers/get_only.h"
#include "utils/containers/map_values.h"
#include "utils/containers/maximum.h"
#include "utils/containers/require_only_key.h"
#include "utils/containers/sum.h"
#include "utils/containers/transform.h"
#include "utils/containers/unordered_set_of.h"
#include "utils/containers/values.h"
#include "utils/exception.h"
#include "utils/optional.h"
#include <optional>

namespace FlexFlow {

LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config)
: runtime_arg_config(config) {}
// Constructs a cost estimator that profiles operator kernels on one local
// device, copying the per-device execution configuration it will reuse for
// every estimate_cost() call.
//
// Parameters:
//   interconnect_specification - machine interconnect description used for
//       communication-cost estimates.
//   allocator                  - allocator used for tensor buffers during
//       profiling (copied into the member by value).
//   profiling_settings         - settings forwarded to the profiling passes.
//   device_handle              - handle for the device kernels run on.
//   iteration_config           - iteration configuration forwarded to the
//       forward/backward profiling passes.
//   device_idx                 - id of the local device to profile on.
//
// NOTE(review): `allocator` is taken by non-const reference but stored by
// value, so the member is an independent copy — presumably Allocator has
// shared-handle copy semantics; confirm against kernels/allocation.h.
LocalCostEstimator::LocalCostEstimator(
MachineInterconnectSpecification const &interconnect_specification,
Allocator &allocator,
ProfilingSettings const &profiling_settings,
device_handle_t const &device_handle,
FFIterationConfig const &iteration_config,
device_id_t device_idx)
: interconnect_specification(interconnect_specification),
allocator(allocator), profiling_settings(profiling_settings),
device_handle(device_handle), iteration_config(iteration_config),
device_idx(device_idx) {}

static ComputationGraph computation_graph_for_local_cost_estimation(
ComputationGraphOpAttrs const &op,
std::vector<ParallelTensorShape> const &inputs,
std::vector<ParallelTensorShape> const &weights,
std::vector<ParallelTensorShape> const &outputs) {
std::unordered_map<TensorSlotName, ParallelTensorShape> const &inputs,
std::unordered_map<TensorSlotName, ParallelTensorShape> const &weights,
std::unordered_map<TensorSlotName, ParallelTensorShape> const &outputs) {
ComputationGraph computation_graph = make_empty_computation_graph();

std::vector<tensor_guid_t> input_tensors;
for (ParallelTensorShape const &input : inputs) {
LayerAddedResult inputs_layer = add_layer(
computation_graph,
LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}},
std::nullopt},
{},
{});
input_tensors.push_back(get_only(inputs_layer.outputs));
}

std::vector<tensor_guid_t> weight_tensors;
for (ParallelTensorShape const &weight : weights) {
LayerAddedResult weights_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
get_piece_shape(weight),
InitializerAttrs{ZeroInitializerAttrs{}}}},
std::nullopt},
{},
{});
weight_tensors.push_back(get_only(weights_layer.outputs));
}
std::unordered_map<TensorSlotName, tensor_guid_t> input_tensors =
map_values(inputs, [&](ParallelTensorShape const &shape) {
LayerAddedResult inputs_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{
InputAttrs{get_piece_shape(shape)}},
std::nullopt},
{},
{});
return require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
});

std::unordered_map<TensorSlotName, tensor_guid_t> weight_tensors =
map_values(weights, [&](ParallelTensorShape const &shape) {
LayerAddedResult weights_layer =
add_layer(computation_graph,
LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
get_piece_shape(shape),
InitializerAttrs{ZeroInitializerAttrs{}}}},
std::nullopt},
{},
{});
return require_only_key(weights_layer.outputs, TensorSlotName::OUTPUT);
});

// create operator layer
LayerAddedResult operator_layer = add_layer(computation_graph,
Expand All @@ -72,10 +87,13 @@ OpCostMetrics LocalCostEstimator::estimate_cost(
OpCostEstimateKey const &op_cost_estimate_key) const {

PCGOperatorAttrs op = op_cost_estimate_key.op_attrs;
std::vector<ParallelTensorShape> inputs = op_cost_estimate_key.input_shapes;
std::vector<ParallelTensorShape> weights = op_cost_estimate_key.weight_shapes;
std::vector<ParallelTensorShape> outputs = op_cost_estimate_key.output_shapes;
MachineView mv = op_cost_estimate_key.machine_view;
std::unordered_map<TensorSlotName, ParallelTensorShape> inputs =
op_cost_estimate_key.input_shapes;
std::unordered_map<TensorSlotName, ParallelTensorShape> weights =
op_cost_estimate_key.weight_shapes;
std::unordered_map<TensorSlotName, ParallelTensorShape> outputs =
op_cost_estimate_key.output_shapes;
OptimizerAttrs optimizer_attrs = op_cost_estimate_key.optimizer_attrs;

if (is_parallel_op(op) || op.has<InputAttrs>() || op.has<NoopAttrs>() ||
op.has<WeightAttrs>()) {
Expand All @@ -89,30 +107,50 @@ OpCostMetrics LocalCostEstimator::estimate_cost(
// allocate memory
std::shared_ptr<TrackedAllocator> tracked_allocator_ptr =
std::make_shared<TrackedAllocator>(create_local_allocator_for_device_type(
runtime_arg_config.kernel_device_type));
get_device_type(this->device_idx)));

layer_guid_t layer_guid = layer_guid_t{Node{0}};

Allocator allocator = Allocator(tracked_allocator_ptr);

ComputationGraph cg = computation_graph_for_local_cost_estimation(
/*op=*/assert_unwrap(compgraph_op_attrs_from_pcg_op_attrs(op)),
/*inputs=*/inputs,
/*weights=*/weights,
/*outputs=*/outputs);

ComputationGraphInstance instance = create_computation_graph_instance(
/*compgraph=*/cg,
/*optimizer_attrs=*/optimizer_attrs,
/*loss_attrs=*/std::nullopt,
/*label_tensor=*/std::nullopt,
/*logit_tensor=*/std::nullopt,
/*input_tensors=*/{},
/*allocator=*/allocator,
/*profiling_settings=*/this->profiling_settings,
/*device_handle=*/this->device_handle,
/*iteration_config=*/this->iteration_config,
/*device_idx=*/this->device_idx);

// execute layer
layer_guid_t operator_layer_guid =
get_layer_by_name(training_cg.computation_graph, "operator");

milliseconds_t fwd = execute_forward(local_backing.local_task_registry,
local_backing.local_tensor_backing,
local_backing.local_args_backing,
get_training_layer_plus_context(
training_cg, operator_layer_guid),
allocator)
.value();
milliseconds_t bwd = execute_backward(local_backing.local_task_registry,
local_backing.local_tensor_backing,
local_backing.local_args_backing,
get_training_layer_plus_context(
training_cg, operator_layer_guid),
allocator)
.value();
dynamic_layer_guid_t operator_layer_guid{get_layer_by_name(cg, "operator")};

std::unordered_map<dynamic_layer_guid_t, std::optional<milliseconds_t>>
fwd_timing = perform_forward_pass_for_computation_graph_instance(
instance,
this->profiling_settings,
this->device_handle,
this->iteration_config,
this->device_idx);
milliseconds_t fwd = fwd_timing.at(operator_layer_guid).value();
std::unordered_map<dynamic_layer_guid_t, std::optional<milliseconds_t>>
bwd_timing = perform_backward_pass_for_computation_graph_instance(
instance,
this->profiling_settings,
this->device_handle,
this->iteration_config,
this->device_idx);
milliseconds_t bwd = bwd_timing.at(operator_layer_guid).value();

return OpCostMetrics{
/*forward_runtime=*/fwd,
Expand All @@ -123,7 +161,6 @@ OpCostMetrics LocalCostEstimator::estimate_cost(

milliseconds_t LocalCostEstimator::estimate_cost(
TensorSetMovement const &tensor_set_movement) const {

auto estimate_single_comm_cost =
[&](MachineSpaceCoordinate const &src,
MachineSpaceCoordinate const &dst,
Expand All @@ -147,11 +184,19 @@ milliseconds_t LocalCostEstimator::estimate_cost(
}));
}

CostEstimator
get_local_cost_estimator(RuntimeArgConfig const &runtime_arg_config) {
return CostEstimator::create<LocalCostEstimator>(runtime_arg_config);
// Factory helper: wraps a LocalCostEstimator in the type-erased CostEstimator
// interface. All arguments are forwarded unchanged to the
// LocalCostEstimator constructor.
CostEstimator get_local_cost_estimator(
    MachineInterconnectSpecification const &interconnect_specification,
    Allocator &allocator,
    ProfilingSettings const &profiling_settings,
    device_handle_t const &device_handle,
    FFIterationConfig const &iteration_config,
    device_id_t device_idx) {
  return CostEstimator::create<LocalCostEstimator>(
      interconnect_specification, allocator, profiling_settings,
      device_handle, iteration_config, device_idx);
}

} // namespace FlexFlow

#endif
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
#if 0 // FIXME (Elliott): fix cost estimator

#include "local-execution/tracked_allocator.h"
#include "local-execution/cost_estimator/tracked_allocator.h"
#include "kernels/device.h"

namespace FlexFlow {
Expand Down Expand Up @@ -35,5 +33,3 @@ Allocator get_tracked_memory_allocator(Allocator const &base_allocator) {
}

} // namespace FlexFlow

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "pcg/device_id_t.h"
#include "utils/exception.h"
#include "utils/optional.h"
#include "utils/overload.h"

namespace FlexFlow {

Expand All @@ -26,6 +27,36 @@ LocalTaskArgumentAccessor::LocalTaskArgumentAccessor(
iteration_config(iteration_config), optimizer_attrs(optimizer_attrs),
device_idx(device_idx) {}

// Looks up the shape of the tensor bound to `slot` by scanning the backing
// map. Forward, gradient, and optimizer tensor parameters match by name;
// loss tensor parameters are never matched by a TensorSlotName. Panics if no
// backing entry matches the slot.
TensorShape
LocalTaskArgumentAccessor::get_tensor_shape(TensorSlotName slot) const {
  for (auto const &[backing_slot, accessor] : this->tensor_slots_backing) {
    bool const name_matches = backing_slot.visit<bool>(overload{
        [&](TaskForwardTensorParameter const &p) { return p.name == slot; },
        [&](TaskGradientTensorParameter const &p) { return p.name == slot; },
        [&](TaskOptimizerTensorParameter const &p) { return p.name == slot; },
        [&](TaskLossTensorParameter const &) { return false; },
    });

    if (!name_matches) {
      continue;
    }

    // Read the shape from whichever accessor variant is stored.
    if (accessor.has<GenericTensorAccessorR>()) {
      return accessor.get<GenericTensorAccessorR>().shape;
    }
    return accessor.get<GenericTensorAccessorW>().shape;
  }

  PANIC("Unable to find TensorSlotName in tensor_slots_backing",
        fmt::to_string(slot));
}

GenericTensorAccessor
LocalTaskArgumentAccessor::get_tensor(TaskTensorParameter slot,
Permissions priv) const {
Expand Down
Loading
Loading