diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 8ff7e3560799..8165979f90fb 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -25,7 +25,9 @@ value of HL_DEBUG_CODEGEN, if any). HL_PERMIT_FAILED_UNROLL - Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. + Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. + Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not + turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. #ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS @@ -199,7 +201,15 @@ AutoSchedule::AutoSchedule(const FunctionDAG &dag, Statistics &stats, SearchSpace &search_space, const LoopNestParser *partial_schedule) - : dag{dag}, params{params}, target{target}, outputs{outputs}, rng{rng}, cost_model{cost_model}, stats{stats}, search_space{search_space}, partial_schedule{partial_schedule} { + : dag{dag}, + params{params}, + target{target}, + outputs{outputs}, + rng{rng}, + cost_model{cost_model}, + stats{stats}, + search_space{search_space}, + partial_schedule{partial_schedule} { configure_pipeline_features(dag, params, cost_model); } @@ -220,27 +230,26 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, int expanded = 0; - std::function &&)> enqueue_new_children = - [&](IntrusivePtr &&s) { - // aslog(1) << "\n** Generated child: "; - // s->dump(); - // s->calculate_cost(dag, params, nullptr, true); + std::function &&)> enqueue_new_children = [&](IntrusivePtr &&s) { + // aslog(1) << "\n** Generated child: "; + // s->dump(); + // s->calculate_cost(dag, params, nullptr, true); - // Each child should have one more decision made than its parent state. - internal_assert(s->num_decisions_made == s->parent->num_decisions_made + 1); + // Each child should have one more decision made than its parent state. 
+ internal_assert(s->num_decisions_made == s->parent->num_decisions_made + 1); - int progress = s->num_decisions_made * beam_size + expanded; - size_t max_progress = dag.nodes.size() * beam_size * 2; + int progress = s->num_decisions_made * beam_size + expanded; + size_t max_progress = dag.nodes.size() * beam_size * 2; - // Update the progress bar - tick.set(double(progress) / max_progress); - s->penalized = false; + // Update the progress bar + tick.set(double(progress) / max_progress); + s->penalized = false; - ++stats.num_states_added; + ++stats.num_states_added; - // Add the state to the list of states to evaluate - q.emplace(std::move(s)); - }; + // Add the state to the list of states to evaluate + q.emplace(std::move(s)); + }; std::unique_ptr target_loop_nest; @@ -600,7 +609,15 @@ void generate_schedule(const std::vector &outputs, std::mt19937 rng{(uint32_t)params.random_dropout_seed}; SearchSpace search_space{dag, params, target, rng, cost_model.get(), stats, partial_schedule.get()}; - AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model.get(), stats, search_space, partial_schedule.get()}; + AutoSchedule autoschedule{dag, + params, + target, + outputs, + rng, + cost_model.get(), + stats, + search_space, + partial_schedule.get()}; // Run beam search optimal = autoschedule.optimal_schedule(params.beam_size); @@ -656,7 +673,8 @@ void generate_schedule(const std::vector &outputs, aslog(1) << "Total cost model evaluation time (ms): " << stats.total_cost_model_evaluation_time() << "\n"; aslog(1) << "Average cost model evaluation time (ms): " << stats.average_cost_model_evaluation_time() << "\n"; std::chrono::duration total_time = timer.elapsed(); - aslog(1) << "Time taken for autoscheduler (s): " << std::chrono::duration_cast(total_time).count() / 1000.0 << '\n'; + aslog(1) << "Time taken for autoscheduler (s): " + << std::chrono::duration_cast(total_time).count() / 1000.0 << '\n'; } struct Anderson2021 { @@ -717,7 +735,15 @@ void find_and_apply_schedule(FunctionDAG &dag, } SearchSpace search_space{dag, params, target, rng, cost_model, stats, partial_schedule.get()}; - AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model, stats, search_space, partial_schedule.get()}; + AutoSchedule autoschedule{dag, + params, + target, + outputs, + rng, + cost_model, + stats, + search_space, + partial_schedule.get()}; IntrusivePtr optimal = autoschedule.optimal_schedule(beam_size); diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index 3eede5993d98..51bf21f21780 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -51,8 +51,7 @@ void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::Func const int pipeline_feat_size = head1_w * head1_h; // We ignore the first seven pipeline features in the cost // model. It's just a mask of which types are in use. 
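// Illustration, not part of the patch: the assertion that follows only pins down
// layout arithmetic. With hypothetical sizes, a struct laid out as a seven-int
// type mask followed by the per-stage feature counters satisfies the same relation:
//   struct ExamplePipelineFeatures { int type_mask[7]; int counters[56]; };
//   static_assert(sizeof(ExamplePipelineFeatures) - 7 * sizeof(int) == sizeof(int) * 56,
//                 "Incorrect size for pipeline features");
// Here 56 stands in for pipeline_feat_size (head1_w * head1_h); the real
// PipelineFeatures struct is checked the same way below.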
- static_assert(sizeof(PipelineFeatures) - 7 * sizeof(int) == - sizeof(int) * pipeline_feat_size, + static_assert(sizeof(PipelineFeatures) - 7 * sizeof(int) == sizeof(int) * pipeline_feat_size, "Incorrect size for pipeline features"); int num_stages = 0; for (const auto &n : dag.nodes) { @@ -231,15 +230,22 @@ float DefaultCostModel::backprop(const Runtime::Buffer &true_runtim batch_id, pipeline_feat_queue, schedule_feat_queue, - weights.head1_filter, weights.head1_bias, - weights.head2_filter, weights.head2_bias, - weights.conv1_filter, weights.conv1_bias, - learning_rate, timestep++, + weights.head1_filter, + weights.head1_bias, + weights.head2_filter, + weights.head2_bias, + weights.conv1_filter, + weights.conv1_bias, + learning_rate, + timestep++, fastest_idx, true_runtimes.alias(), - head1_filter_update, head1_bias_update, - head2_filter_update, head2_bias_update, - conv1_filter_update, conv1_bias_update, + head1_filter_update, + head1_bias_update, + head2_filter_update, + head2_bias_update, + conv1_filter_update, + conv1_bias_update, dst, dst_costs_per_stage, loss); diff --git a/src/autoschedulers/anderson2021/FunctionDAG.cpp b/src/autoschedulers/anderson2021/FunctionDAG.cpp index e4dae392d13d..1a057187dcbd 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.cpp +++ b/src/autoschedulers/anderson2021/FunctionDAG.cpp @@ -239,10 +239,10 @@ class Featurizer : public IRVisitor { void visit_memory_access(const std::string &name, Type t, const vector &args, PipelineFeatures::AccessType type) { // Compute matrix of partial derivatives of args w.r.t. loop params LoadJacobian matrix(args.size(), stage.loop.size(), 1); - vector ones_per_row(args.size(), 0), - zeros_per_row(args.size(), 0), - ones_per_col(stage.loop.size(), 0), - zeros_per_col(stage.loop.size(), 0); + vector ones_per_row(args.size(), 0); + vector zeros_per_row(args.size(), 0); + vector ones_per_col(stage.loop.size(), 0); + vector zeros_per_col(stage.loop.size(), 0); bool is_pointwise = args.size() == stage.loop.size(); for (size_t i = 0; i < args.size(); i++) { for (size_t j = 0; j < stage.loop.size(); j++) { @@ -295,7 +295,8 @@ class Featurizer : public IRVisitor { public: Featurizer(Function &func, FunctionDAG::Node::Stage &stage) - : func(func), stage(stage) { + : func(func), + stage(stage) { } void visit_store_args(const std::string &name, Type t, vector args) { diff --git a/src/autoschedulers/anderson2021/FunctionDAG.h b/src/autoschedulers/anderson2021/FunctionDAG.h index ef7e57651462..4e08a917f8ca 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.h +++ b/src/autoschedulers/anderson2021/FunctionDAG.h @@ -39,7 +39,8 @@ struct OptionalRational { OptionalRational() = default; OptionalRational(int64_t n, int64_t d) - : numerator(n), denominator(d) { + : numerator(n), + denominator(d) { } void operator+=(const OptionalRational &other) { @@ -137,7 +138,9 @@ class LoadJacobian { public: LoadJacobian(size_t producer_storage_dims, size_t consumer_loop_dims, int64_t count) - : c(count), rows(producer_storage_dims), cols(consumer_loop_dims) { + : c(count), + rows(producer_storage_dims), + cols(consumer_loop_dims) { coeffs.resize(rows * cols); } @@ -283,7 +286,9 @@ class Span { } Span(int64_t a, int64_t b, bool c) - : min_(a), max_(b), constant_extent_(c) { + : min_(a), + max_(b), + constant_extent_(c) { } Span() = default; Span(const Span &other) = default; diff --git a/src/autoschedulers/anderson2021/GPUMemInfo.h b/src/autoschedulers/anderson2021/GPUMemInfo.h index 7140a3be0ba1..40c000e1b8c1 100644 --- 
a/src/autoschedulers/anderson2021/GPUMemInfo.h +++ b/src/autoschedulers/anderson2021/GPUMemInfo.h @@ -175,7 +175,10 @@ struct Strides { struct GlobalAccessAccumulator { GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose) - : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} { + : bytes_per_access{bytes_per_access}, + dimensions{dimensions}, + strides{strides}, + verbose{verbose} { } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { @@ -257,7 +260,10 @@ struct GlobalAccessAccumulator { struct SharedAccessAccumulator { SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose) - : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} { + : bytes_per_access{bytes_per_access}, + dimensions{dimensions}, + strides{strides}, + verbose{verbose} { } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { @@ -348,7 +354,8 @@ struct SharedAccessAccumulator { struct LocalAccessAccumulator { LocalAccessAccumulator(int bytes_per_access, bool verbose) - : bytes_per_access{bytes_per_access}, verbose{verbose} { + : bytes_per_access{bytes_per_access}, + verbose{verbose} { } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index be0209affb38..04850bafe633 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -176,7 +176,13 @@ bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, vector vectorized_indices; this->get_stage_sizes(f, stage_sizes, pure_dims, vectorized_indices); internal_assert(!stage_sizes.empty()); - auto tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, (int)(stage_sizes[0].size() - 1), vectorized_indices, true, false); + auto tilings = generate_gpu_tilings(stage_sizes, + pure_dims, + max_size, + (int)(stage_sizes[0].size() - 1), + vectorized_indices, + true, + false); bool made_child = false; for (const auto &t : tilings) { LoopNest *new_parent = new LoopNest; @@ -309,7 +315,9 @@ GPUMemoryType LoopNest::get_gpu_memory_type(bool in_block, bool in_thread, bool return GPUMemoryType::Global; } -std::vector LoopNest::unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const { +std::vector LoopNest::unrolled_loops(const Target &target, + const LoopNest *parent, + const LoopNest *grandparent) const { internal_assert(innermost); const auto &grandparent_bounds = grandparent->get_bounds(node); std::vector unrolled(parent->size.size(), 0); @@ -358,14 +366,14 @@ void LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target NodeMap &can_be_promoted_to_registers, const LoopNest *grandparent, const LoopNest *parent) const { - for (const auto *alloc_node : store_at) { const auto &store_site = sites.get(&alloc_node->stages[0]); if (store_site.gpu_store_memory_type != GPUMemoryType::Local) { continue; } - can_be_promoted_to_registers.get_or_create(alloc_node) = store_site.is_constant_allocation && store_site.allocation_size <= get_register_mem_alloc_limit(); + can_be_promoted_to_registers.get_or_create(alloc_node) = store_site.is_constant_allocation && + store_site.allocation_size <= get_register_mem_alloc_limit(); } for (const auto &c : children) { @@ -380,7 +388,8 @@ void 
LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target continue; } - can_be_promoted_to_registers.get(e->producer) = can_be_promoted_to_registers.get(e->producer) && accessed_at_constant_indices(unrolled, e); + can_be_promoted_to_registers.get(e->producer) = can_be_promoted_to_registers.get(e->producer) && + accessed_at_constant_indices(unrolled, e); } } } @@ -578,7 +587,11 @@ int LoopNest::get_vectorized_loop_index_from_pure_stage(const LoopNest &root) co // Get the stride over "node's" storage for a unit increment in the vectorized loop's // index -double LoopNest::storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const { +double LoopNest::storage_stride(const LoadJacobian &jac, + int innermost_storage_dim, + const FunctionDAG::Node *storage_node, + const Bound &store_bounds, + const LoopNest &root) const { internal_assert(innermost_storage_dim >= 0); // The node's storage dimensions (from innermost outward) @@ -613,7 +626,10 @@ double LoopNest::storage_stride(const LoadJacobian &jac, int innermost_storage_d } // Shared mem accesses with stride 1 will likely be vectorized -bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const { +bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + int innermost_dim, + int loop_index) const { for (int i = 0; i < accessed->dimensions; i++) { auto stride = jac(i, loop_index); if (i == innermost_dim) { @@ -628,7 +644,12 @@ bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, c return true; } -bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const { +bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + bool accessed_has_been_scheduled, + int innermost_dim, + int loop_index, + const GPUMemoryType &mem_type) const { if (loop_index < 0 || mem_type != GPUMemoryType::Shared) { return false; } @@ -637,7 +658,12 @@ bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const Functio return can_vectorize_access_for_innermost_dim(jac, accessed, innermost_dim, loop_index); } -int LoopNest::vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { +int LoopNest::vectorized_load_access_size(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + bool accessed_has_been_scheduled, + int innermost_dim, + const GPUMemoryType &mem_type, + bool verbose) const { int vector_size = 1; if (mem_type != GPUMemoryType::Shared) { return vector_size; @@ -704,6 +730,7 @@ int LoopNest::vectorized_access_size(size_t loop_index, bool verbose) const { return 1; } + double LoopNest::compute_local_mem_stride(double stride, double bytes) const { // Each word is 4 bytes so adjust the stride based // on width of data being accessed @@ -718,11 +745,18 @@ double LoopNest::compute_local_mem_stride(double stride, double bytes) const { // Get the stride over "node's" storage and its element-wise stride for a unit // increment in the given thread loops -Strides LoopNest::compute_strides(const LoadJacobian &jac, int 
innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose) const { +Strides LoopNest::compute_strides(const LoadJacobian &jac, + int innermost_storage_dim, + const FunctionDAG::Node *storage_node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + bool verbose) const { internal_assert(innermost_storage_dim >= 0); if (verbose) { - aslog(2) << "\nstrides: " << node->func.name() << " (stage = " << stage->index << ") loading from " << storage_node->func.name() << " ->\n"; + aslog(2) << "\nstrides: " << node->func.name() << " (stage = " + << stage->index << ") loading from " + << storage_node->func.name() << " ->\n"; if (aslog::aslog_level() >= 2) { jac.dump(""); } @@ -811,7 +845,9 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage return strides; } -bool LoopNest::all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const { +bool LoopNest::all_strides_exist(const LoadJacobian &jac, + const FunctionDAG::Node *storage_node, + const LoopNest &root) const { int v = get_vectorized_loop_index_from_pure_stage(root); for (int i = 0; i < storage_node->dimensions; i++) { @@ -838,7 +874,20 @@ int LoopNest::get_actual_vector_dim(const Bound &store_bounds) const { return vector_dim; } -void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose) const { +void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const GPULoopInfo &gpu_loop_info, + const std::vector &inner_serial_loop_extents, + const Sites &consumer_site, + ScheduleFeatures &feat, + const LoopNest *parent, + const LoopNest &root, + GlobalMemInfo &global_mem_loads, + SharedMemInfo &shared_mem_loads, + LocalMemInfo &local_mem_loads, + bool verbose) const { if (consumer_site.is_stored_in_registers()) { return; } @@ -906,7 +955,9 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } feat.shared_mem_store_efficiency = shared_mem_info.efficiency(); - internal_assert(in_range_zero_one(feat.shared_mem_store_efficiency)) << "Invalid shared mem store efficiency: " << feat.shared_mem_store_efficiency << " for " << node->func.name(); + internal_assert(in_range_zero_one(feat.shared_mem_store_efficiency)) + << "Invalid shared mem store efficiency: " << feat.shared_mem_store_efficiency + << " for " << node->func.name(); } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::Global) { if (verbose) { @@ -928,7 +979,9 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } feat.global_mem_store_efficiency = global_mem_info.efficiency(); - internal_assert(in_range_zero_one(feat.global_mem_store_efficiency)) << "Invalid global mem store efficiency: " << feat.global_mem_store_efficiency << " for " << node->func.name(); + internal_assert(in_range_zero_one(feat.global_mem_store_efficiency)) + << "Invalid global mem store efficiency: " << feat.global_mem_store_efficiency + << " for " << node->func.name(); } else if 
(consumer_site.gpu_store_memory_type == GPUMemoryType::Local) { auto local_mem_info = compute_mem_store_info( @@ -945,7 +998,9 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } // feat.local_mem_store_efficiency = local_mem_info.efficiency(); - // internal_assert(in_range_zero_one(feat.local_mem_store_efficiency)) << "Invalid local mem store coalesce efficiency: " << feat.local_mem_store_efficiency << " for " << node->func.name(); + // internal_assert(in_range_zero_one(feat.local_mem_store_efficiency)) + // << "Invalid local mem store coalesce efficiency: " << feat.local_mem_store_efficiency + // << " for " << node->func.name(); } if (verbose) { @@ -959,7 +1014,11 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::Local) { mem_type = "local"; } - aslog(2) << "END MEM ACCESS " << mem_type << "_mem_" << type << ". consumer: " << consumer_name << "_s" << stage->index << "; producer: " << consumer_name; + aslog(2) << "END MEM ACCESS " + << mem_type << "_mem_" << type + << ". consumer: " << consumer_name + << "_s" << stage->index + << "; producer: " << consumer_name; if (!jac.all_coeffs_exist()) { aslog(2) << " (not all coeffs exist)"; } @@ -968,7 +1027,14 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } template -void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const { +void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose) const { int bytes_per_access = node->bytes_per_point; // If the consumer is a scalar and is compute_root, then it will not be @@ -992,10 +1058,7 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const Accumulator accumulator(bytes_per_access, dimensions, strides, verbose); thread_info->for_each_thread_id_in_first_warp(accumulator); - accumulator.add_access_info( - num_requests, - mem_info, - false); + accumulator.add_access_info(num_requests, mem_info, false); if (verbose) { aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n"; @@ -1015,22 +1078,40 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const Accumulator accumulator(bytes_per_access, dimensions, strides, verbose); thread_info->for_each_thread_id_in_tail_warp(accumulator); - accumulator.add_access_info( - num_requests_per_warp, - mem_info, - true); + accumulator.add_access_info(num_requests_per_warp, mem_info, true); if (verbose) { aslog(2) << "END tail warp\n\n"; } } -template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; - -template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; +template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian 
&jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose) const; + +template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose) const; template<> -void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const { +void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose) const { int bytes_per_access = node->bytes_per_point; // If the consumer is a scalar and is compute_root, then it will not be @@ -1047,10 +1128,7 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian & LocalAccessAccumulator accumulator(bytes_per_access, verbose); thread_info->for_each_thread_id_in_first_warp(accumulator); - accumulator.add_access_info( - num_requests, - mem_info, - false); + accumulator.add_access_info(num_requests, mem_info, false); if (verbose) { aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n"; @@ -1070,17 +1148,20 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian & LocalAccessAccumulator accumulator(bytes_per_access, verbose); thread_info->for_each_thread_id_in_tail_warp(accumulator); - accumulator.add_access_info( - num_requests_per_warp, - mem_info, - true); + accumulator.add_access_info(num_requests_per_warp, mem_info, true); if (verbose) { aslog(2) << "END tail warp\n\n"; } } -std::pair LoopNest::compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const { +std::pair +LoopNest::compute_local_mem_store_features(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const LoopNest &root, + double serial_loop_extents) const { // Assume worst case serialized loads if the stride is unknown if (!all_strides_exist(jac, node, root)) { double stride = compute_local_mem_stride(32.0, node->bytes_per_point); @@ -1095,21 +1176,60 @@ std::pair LoopNest::compute_local_mem_store_features(const LoadJ } template -MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const { +MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const ThreadInfo *thread_info, + double serial_loop_extents, + bool verbose) const { MemInfoType mem_info; - compute_num_mem_accesses_per_block(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose); + compute_num_mem_accesses_per_block(jac, + node, + consumer_store_bounds, + thread_info, + consumer_innermost_dim, + 
serial_loop_extents, + mem_info, verbose); return mem_info; } -template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const; - -template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const; +template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const ThreadInfo *thread_info, + double serial_loop_extents, + bool verbose) const; + +template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const ThreadInfo *thread_info, + double serial_loop_extents, + bool verbose) const; template -void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { +void LoopNest::compute_mem_load_features(const LoadJacobian &jac, + int producer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &producer_store_bounds, + bool producer_has_been_scheduled, + const ThreadInfo *thread_info, + MemInfoType &mem_info, + double points_accessed_per_thread, + bool verbose) const { if (producer_has_been_scheduled) { - compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); + compute_num_mem_accesses_per_block(jac, + node, + producer_store_bounds, + thread_info, + producer_innermost_dim, + points_accessed_per_thread, + mem_info, + verbose); return; } @@ -1121,7 +1241,14 @@ void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_i for (int i = 0; i < node->dimensions; i++) { MemInfoType info; - compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, i, points_accessed_per_thread, info, verbose); + compute_num_mem_accesses_per_block(jac, + node, + producer_store_bounds, + thread_info, + i, + points_accessed_per_thread, + info, + verbose); if (i == 0 || info.num_transactions() < min_required_accesses) { min_info = info; min_required_accesses = info.num_transactions(); @@ -1161,7 +1288,14 @@ void LoopNest::compute_mem_load_features(const LoadJacobian &jac, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { - compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); + compute_num_mem_accesses_per_block(jac, + node, + producer_store_bounds, + thread_info, + producer_innermost_dim, + points_accessed_per_thread, + mem_info, + verbose); } // Assumes block, serial, thread or block, thread nesting @@ -1176,7 +1310,8 @@ const LoopNest *LoopNest::get_enclosing_block(const LoopNest *parent, const Loop return grandparent; } - internal_error << "Invalid nesting: " << stringify(parent->gpu_label) << ", " << stringify(grandparent->gpu_label) << "\n"; + internal_error << "Invalid nesting: " << 
stringify(parent->gpu_label) << ", " << stringify(grandparent->gpu_label) + << "\n"; return nullptr; } @@ -1252,12 +1387,16 @@ void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopIn features.block_occupancy = thread_info->block_occupancy(); features.num_threads_per_block = thread_info->num_threads; - internal_assert(in_range_zero_one(features.block_occupancy)) << "Invalid block occupancy: " << features.block_occupancy; - internal_assert(in_range_zero_one(features.warp_lane_utilization)) << "Invalid warp utilization: " << features.warp_lane_utilization; + internal_assert(in_range_zero_one(features.block_occupancy)) + << "Invalid block occupancy: " << features.block_occupancy; + internal_assert(in_range_zero_one(features.warp_lane_utilization)) + << "Invalid warp utilization: " << features.warp_lane_utilization; } // Assume that when a block is active, all its warps are active -void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { +void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, + ScheduleFeatures &feat, + const GPULoopInfo &gpu_loop_info) const { // Only compute these features for stage's that actually have a block // loop if (node != gpu_loop_info.current_block_loop->node) { @@ -1282,7 +1421,10 @@ void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params ¶ms feat.max_block_occupancy = (double)max_active_blocks / (double)active_block_hardware_limit; } -void LoopNest::compute_shared_mem_occupancy(const Anderson2021Params ¶ms, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const { +void LoopNest::compute_shared_mem_occupancy(const Anderson2021Params ¶ms, + const Target &target, + int64_t total_shared_mem_alloc_size, + ScheduleFeatures &feat) const { if (!is_gpu_block(target)) { return; } @@ -1295,10 +1437,12 @@ void LoopNest::compute_shared_mem_occupancy(const Anderson2021Params ¶ms, co internal_assert(feat.shared_mem_occupancy <= 1) << "Invalid shared mem occupancy: " << feat.shared_mem_occupancy; if (total_shared_mem_alloc_size > 0) { - auto shared_mem_max_active_blocks = std::min(active_block_hardware_limit, shared_mem_sm_limit / total_shared_mem_alloc_size); + auto shared_mem_max_active_blocks = std::min(active_block_hardware_limit, + shared_mem_sm_limit / total_shared_mem_alloc_size); feat.shared_mem_block_limit_factor = (double)shared_mem_max_active_blocks / (double)active_block_hardware_limit; - internal_assert(feat.shared_mem_block_limit_factor <= 1) << "Invalid shared mem block limit factor: " << feat.shared_mem_block_limit_factor; + internal_assert(feat.shared_mem_block_limit_factor <= 1) + << "Invalid shared mem block limit factor: " << feat.shared_mem_block_limit_factor; } } @@ -1329,20 +1473,21 @@ std::pair LoopNest::find_innermost_and_paren return {child, parent}; } -int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, - const Target &target, - const GPULoopInfo &gpu_loop_info, - const std::vector &edge_chain, - const LoadJacobian &jac, - const LoopNest *parent, - const LoopNest *grandparent, - int64_t n, - const ScheduleFeatures &feat, - const LoadJacobian &serial_jac, - bool producer_has_been_scheduled, - int producer_innermost_dim, - const GPUMemoryType &mem_type, - bool verbose) const { +int64_t LoopNest::points_accessed_per_thread( + const Anderson2021Params ¶ms, + const Target &target, + const GPULoopInfo &gpu_loop_info, + const std::vector &edge_chain, + 
const LoadJacobian &jac, + const LoopNest *parent, + const LoopNest *grandparent, + int64_t n, + const ScheduleFeatures &feat, + const LoadJacobian &serial_jac, + bool producer_has_been_scheduled, + int producer_innermost_dim, + const GPUMemoryType &mem_type, + bool verbose) const { std::unique_ptr innermost_parent_clone = std::make_unique(); innermost_parent_clone->copy_from(*parent); @@ -1406,7 +1551,15 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, } } - IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles(tiling, grandparent, params, target, true, false, false, rvars_to_move_inward); + IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles( + tiling, + grandparent, + params, + target, + true, + false, + false, + rvars_to_move_inward); const auto &bounds = innermost_parent->get_bounds_along_edge_chain(producer, edge_chain); int64_t num_points = 1; @@ -1424,7 +1577,8 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, } // There are 2 ways to calculate the number of points accessed: - // 1. The region_required of the producer in the non-LICM unrolled loops * the loop extents of the non-LICM loops that cannot be unrolled + // 1. The region_required of the producer in the non-LICM unrolled loops * the loop extents of the non-LICM loops + // that cannot be unrolled int64_t points_accessed_by_region_required = num_points * product_of_non_licm_non_unrolled_extents; // 2. The number of points computed according to 'n' (the number of @@ -1443,13 +1597,12 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, points_accessed = points_accessed_by_loop_extents; if (mem_type == GPUMemoryType::Shared) { - int vector_size = parent->vectorized_load_access_size( - serial_jac, - producer, - producer_has_been_scheduled, - producer_innermost_dim, - mem_type, - verbose); + int vector_size = parent->vectorized_load_access_size(serial_jac, + producer, + producer_has_been_scheduled, + producer_innermost_dim, + mem_type, + verbose); if (verbose) { aslog(2) << "\n"; @@ -1467,7 +1620,8 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, points_accessed *= gpu_loop_info.total_outer_serial_extents; - int64_t total_inner_serial_extents_outside_realization = gpu_loop_info.get_total_inner_serial_extents_outside_realization(this); + int64_t total_inner_serial_extents_outside_realization = + gpu_loop_info.get_total_inner_serial_extents_outside_realization(this); // If you have a realization inside a serial loop e.g. // f 80 gpu_block @@ -1497,7 +1651,11 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, return points_accessed; } -int64_t LoopNest::compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const { +int64_t LoopNest::compute_licm_amortization(const LoopNest *innermost, + const LoopNest *parent, + const ScheduleFeatures &feat, + const LoadJacobian &jac, + int producer_dims) const { // Is this load loop-invariant over an // unrolled block? If so, we amortize the // number of loads to account for LICM. 
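A short standalone sketch of the amortization idea described in the comment above (an illustration under assumptions, not the patch's implementation; the UnrolledLoop bookkeeping and the licm_amortization name are hypothetical): if a load's index does not depend on an unrolled inner loop, LICM hoists it out of that loop, so the naive load count can be divided by that loop's extent.

#include <cstdint>
#include <vector>

struct UnrolledLoop {
    int64_t extent;          // trip count of the unrolled loop
    bool load_is_invariant;  // the load's index has no dependence on this loop's variable
};

// Product of the extents of all unrolled loops the load is invariant over;
// dividing the naive load count by this factor models the effect of LICM.
int64_t licm_amortization(const std::vector<UnrolledLoop> &loops) {
    int64_t amortization = 1;
    for (const auto &l : loops) {
        if (l.load_is_invariant) {
            amortization *= l.extent;  // hoisted: one load serves every iteration
        }
    }
    return amortization;
}

For example, a load that is invariant over two unrolled loops of extents 4 and 8 would be charged 1/32 of the naive count, which is the effect the feature computation is trying to capture.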
@@ -1526,7 +1684,8 @@ int64_t LoopNest::compute_licm_amortization(const LoopNest *innermost, const Loo return amortization; } -void LoopNest::memoize_points_computed_minimum(StageMap &memoized_features, const StageMap *features) const { +void LoopNest::memoize_points_computed_minimum(StageMap &memoized_features, + const StageMap *features) const { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); const auto &inlined_feat = features->get(&(f->stages[0])); @@ -1565,9 +1724,7 @@ vector> LoopNest::collect_producers(const StageMap &sites) done.insert(e->producer); const auto &site = sites.get(&(e->producer->stages[0])); if (site.store->is_root()) { - int vector_dim = (e->producer->is_input ? 0 : - site.produce != nullptr ? site.produce->vector_dim : - -1); + int vector_dim = (e->producer->is_input ? 0 : (site.produce != nullptr ? site.produce->vector_dim : -1)); producers.emplace_back(e->producer->id, vector_dim); } else if (site.produce != nullptr) { // Computation must be nested inside this task or inlined into it. @@ -1586,9 +1743,8 @@ uint64_t LoopNest::compute_hash_of_producers_stored_at_root(const StageMap> producers = collect_producers(sites); // Sort them according to node id - std::sort(producers.begin(), producers.end(), [](const pair &a, const pair &b) { - return a.first < b.first; - }); + std::sort(producers.begin(), producers.end(), + [](const pair &a, const pair &b) { return a.first < b.first; }); uint64_t store_root_hash = 0; for (const auto &p : producers) { @@ -1607,7 +1763,8 @@ void LoopNest::collect_stages(std::set &stages } } -void LoopNest::memoize_features(StageMap &memoized_features, const StageMap *features) const { +void LoopNest::memoize_features(StageMap &memoized_features, + const StageMap *features) const { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); if (memoized_features.contains(&(f->stages[0]))) { @@ -1643,7 +1800,8 @@ void LoopNest::compute_working_set_from_features(int64_t *working_set, *working_set += working_set_here; } -void LoopNest::recompute_inlined_features(const StageMap &sites, StageMap *features) const { +void LoopNest::recompute_inlined_features(const StageMap &sites, + StageMap *features) const { for (const auto &c : children) { c->recompute_inlined_features(sites, features); } @@ -1665,9 +1823,8 @@ void LoopNest::recompute_inlined_features(const StageMap &sites, StageMap inlined_feat.inlined_calls += intermediate.inlined_calls; inlined_feat.num_scalars += intermediate.num_scalars; if (inlined_feat.innermost_pure_loop_extent > 0) { - inlined_feat.innermost_pure_loop_extent = - std::min(inlined_feat.innermost_pure_loop_extent, - intermediate.innermost_pure_loop_extent); + inlined_feat.innermost_pure_loop_extent = std::min(inlined_feat.innermost_pure_loop_extent, + intermediate.innermost_pure_loop_extent); } else { inlined_feat.innermost_pure_loop_extent = intermediate.innermost_pure_loop_extent; } @@ -1730,8 +1887,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, size_t i = size[idx]; loop_instances *= i; if (stage->loop[idx].pure && !in_impure) { - if (params.parallelism > 1 && - (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) { + if (params.parallelism > 1 && (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) { // Either we've picked our parallel tiling, or // it's not yet determined. 
Assume we'll not split // any loops and just stop after we hit the @@ -1843,7 +1999,25 @@ void LoopNest::compute_features(const FunctionDAG &dag, ++stats.num_memoization_misses; } - c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, stats, verbose); + c->compute_features(dag, + params, + target, + sites, + subinstances, + parallelism, + this, + parent, + root, + gpu_loop_info, + use_memoized_features, + total_shared_mem_alloc_sizes, + &working_set_here, + &working_set_here_local_constant, + &working_set_here_local_dynamic, + features, + stats, + verbose); + if (use_memoized_features) { c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id); c->memoize_features(c->features[hash_of_producers], features); @@ -1906,7 +2080,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, for (auto *e : node->outgoing_edges) { points_computed_minimum_if_inlined += features->get(e->consumer).points_computed_minimum * e->calls; } - feat.points_computed_minimum = std::min(feat.points_computed_minimum, (double)points_computed_minimum_if_inlined); + feat.points_computed_minimum = std::min(feat.points_computed_minimum, + (double)points_computed_minimum_if_inlined); } // When memoizing, we need to recompute features for inlined Funcs @@ -2142,9 +2317,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, } if (innermost) { - bool parent_unrolled = - (feat.innermost_pure_loop_extent <= get_unroll_limit(target) && - parent->node == node); + bool parent_unrolled = (feat.innermost_pure_loop_extent <= get_unroll_limit(target) && parent->node == node); if (parent_unrolled) { parent_unrolled = all(unrolled_loops(target, parent, grandparent)); @@ -2171,7 +2344,6 @@ void LoopNest::compute_features(const FunctionDAG &dag, int64_t global_lines_loaded = 0, shared_lines_loaded = 0, local_lines_loaded = 0, register_lines_loaded = 0; int64_t global_bytes_loaded_per_thread = 0, shared_bytes_loaded_per_thread = 0, register_bytes_loaded_per_thread = 0; int64_t global_lines_loaded_per_thread = 0, shared_lines_loaded_per_thread = 0, register_lines_loaded_per_thread = 0; - ; int64_t global_allocation_bytes_loaded = 0, shared_allocation_bytes_loaded = 0; GlobalMemInfo global_mem_loads; SharedMemInfo shared_mem_loads; @@ -2193,21 +2365,20 @@ void LoopNest::compute_features(const FunctionDAG &dag, inner_serial_loop_extents_computed = true; auto store_jac = *stage->store_jacobian; - compute_gpu_store_features( - store_jac, - vector_dim, - stage->node, - bounds, - gpu_loop_info, - inner_serial_loop_extents, - consumer_site, - feat, - parent, - root, - global_mem_loads, - shared_mem_loads, - local_mem_loads, - verbose); + compute_gpu_store_features(store_jac, + vector_dim, + stage->node, + bounds, + gpu_loop_info, + inner_serial_loop_extents, + consumer_site, + feat, + parent, + root, + global_mem_loads, + shared_mem_loads, + local_mem_loads, + verbose); } // The parallel loop of the consumer @@ -2352,24 +2523,41 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS shared_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; + aslog(2) << "BEGIN MEM ACCESS shared_mem_load. 
" + << "consumer: " << consumer_name + << "_s" << stage->index + << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Shared, verbose); + int64_t points_accessed = points_accessed_per_thread(params, + target, + gpu_loop_info, + edge_chain, + jac.first, + parent, + grandparent, + n, + feat, + serial_jac.first, + producer_has_been_scheduled, + producer_innermost_dim, + GPUMemoryType::Shared, + verbose); + + compute_mem_load_features(jac.first, + producer_innermost_dim, + e->producer, + producer_store_bounds, + producer_has_been_scheduled, + gpu_loop_info.get_thread_info(), + shared_mem_loads, + points_accessed, + verbose); - compute_mem_load_features( - jac.first, - producer_innermost_dim, - e->producer, - producer_store_bounds, - producer_has_been_scheduled, - gpu_loop_info.get_thread_info(), - shared_mem_loads, - points_accessed, - verbose); if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; - aslog(2) << "END MEM ACCESS shared_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + aslog(2) << "END MEM ACCESS shared_mem_load. consumer: " << node->func.name() + << "; producer: " << e->producer->func.name(); if (!jac.first.all_coeffs_exist()) { aslog(1) << " (not all coeffs exist)"; } @@ -2383,25 +2571,39 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS global_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; + aslog(2) << "BEGIN MEM ACCESS global_mem_load. consumer: " << consumer_name << "_s" + << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Global, verbose); - - compute_mem_load_features( - jac.first, - producer_innermost_dim, - e->producer, - producer_store_bounds, - producer_has_been_scheduled, - gpu_loop_info.get_thread_info(), - global_mem_loads, - points_accessed, - verbose); + int64_t points_accessed = points_accessed_per_thread(params, + target, + gpu_loop_info, + edge_chain, + jac.first, + parent, + grandparent, + n, + feat, + serial_jac.first, + producer_has_been_scheduled, + producer_innermost_dim, + GPUMemoryType::Global, + verbose); + + compute_mem_load_features(jac.first, + producer_innermost_dim, + e->producer, + producer_store_bounds, + producer_has_been_scheduled, + gpu_loop_info.get_thread_info(), + global_mem_loads, + points_accessed, + verbose); if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; - aslog(2) << "END MEM ACCESS global_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + aslog(2) << "END MEM ACCESS global_mem_load. 
consumer: " << node->func.name() + << "; producer: " << e->producer->func.name(); if (!jac.first.all_coeffs_exist()) { aslog(2) << " (not all coeffs exist)"; } @@ -2423,10 +2625,24 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS local_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; + aslog(2) << "BEGIN MEM ACCESS local_mem_load. consumer: " << consumer_name << "_s" + << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Local, verbose); + int64_t points_accessed = points_accessed_per_thread(params, + target, + gpu_loop_info, + edge_chain, + jac.first, + parent, + grandparent, + n, + feat, + jac.first, + producer_has_been_scheduled, + producer_innermost_dim, + GPUMemoryType::Local, + verbose); compute_mem_load_features( jac.first, @@ -2441,7 +2657,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; - aslog(2) << "END MEM ACCESS local_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + aslog(2) << "END MEM ACCESS local_mem_load. consumer: " << node->func.name() + << "; producer: " << e->producer->func.name(); if (!jac.first.all_coeffs_exist()) { aslog(2) << " (not all coeffs exist)"; } @@ -2460,7 +2677,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, // Now look at the shapes of the regions read from // the producer at various sites. - int64_t max_extent = 1, max_thread_extent = 1, max_compute_extent = 1, max_store_extent = 1, max_task_extent = 1; + int64_t max_extent = 1, max_thread_extent = 1, max_compute_extent = 1, max_store_extent = 1, + max_task_extent = 1; for (int i = 0; i < e->producer->dimensions; i++) { auto p = bounds->region_required(i); auto compute_p = producer_compute_bounds->region_computed(i); @@ -2469,7 +2687,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, // Check some invariants internal_assert(store_p.min() <= store_p.max()) << store_p.min() << " " << store_p.max() << "\n"; - internal_assert(compute_p.min() <= compute_p.max()) << compute_p.min() << " " << compute_p.max() << "\n"; + internal_assert(compute_p.min() <= compute_p.max()) + << compute_p.min() << " " << compute_p.max() << "\n"; internal_assert(task_p.min() <= task_p.max()) << task_p.min() << " " << task_p.max() << "\n"; int64_t thread_extent = 1; @@ -2521,7 +2740,9 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (!e->producer->is_input) { const int64_t producer_store_instances = - producer_has_been_scheduled ? features->get_or_create(&(e->producer->stages[0])).num_realizations : site.num_realizations; + producer_has_been_scheduled ? 
+ features->get_or_create(&(e->producer->stages[0])).num_realizations : + site.num_realizations; internal_assert(producer_store_instances > 0); @@ -2621,7 +2842,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, internal_assert(global_bytes_loaded >= 0) << "Negative global bytes loaded: " << global_bytes_loaded << "\n"; internal_assert(shared_bytes_loaded >= 0) << "Negative shared bytes loaded: " << shared_bytes_loaded << "\n"; internal_assert(local_bytes_loaded >= 0) << "Negative local bytes loaded: " << local_bytes_loaded << "\n"; - internal_assert(register_bytes_loaded >= 0) << "Negative register bytes loaded: " << register_bytes_loaded << "\n"; + internal_assert(register_bytes_loaded >= 0) + << "Negative register bytes loaded: " << register_bytes_loaded << "\n"; feat.global_allocation_bytes_read_per_realization = global_allocation_bytes_loaded; feat.shared_allocation_bytes_read_per_realization = shared_allocation_bytes_loaded; @@ -2637,24 +2859,29 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (!at_pure_production) { // Also pessimistically assume this update definition relies on the entirety of the produced region so far. // TODO: This overbills scatters, or writes to a sub-window. - internal_assert(feat.bytes_at_production >= 0) << "Negative bytes at production: " << feat.bytes_at_production << "\n"; + internal_assert(feat.bytes_at_production >= 0) + << "Negative bytes at production: " << feat.bytes_at_production << "\n"; const auto &consumer_site = sites.get(&node->stages[0]); if (consumer_site.is_stored_in_global_mem()) { feat.unique_global_bytes_read_per_realization += feat.bytes_at_production; - feat.unique_global_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.unique_global_lines_read_per_realization += + feat.bytes_at_production / feat.innermost_bytes_at_production; feat.global_allocation_bytes_read_per_realization += feat.bytes_at_production; } else if (consumer_site.is_stored_in_shared_mem()) { feat.unique_shared_bytes_read_per_realization += feat.bytes_at_production; - feat.unique_shared_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.unique_shared_lines_read_per_realization += + feat.bytes_at_production / feat.innermost_bytes_at_production; feat.shared_allocation_bytes_read_per_realization += feat.bytes_at_production; } else if (consumer_site.is_stored_in_local_mem()) { // feat.unique_local_bytes_read_per_realization += feat.bytes_at_production; - // feat.unique_local_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; - // feat.local_allocation_bytes_read_per_realization += feat.bytes_at_production; + // feat.unique_local_lines_read_per_realization += feat.bytes_at_production / + // feat.innermost_bytes_at_production; feat.local_allocation_bytes_read_per_realization += + // feat.bytes_at_production; } else if (consumer_site.is_stored_in_registers()) { feat.unique_register_bytes_read_per_realization += feat.bytes_at_production; - feat.unique_register_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.unique_register_lines_read_per_realization += + feat.bytes_at_production / feat.innermost_bytes_at_production; feat.register_allocation_bytes_read_per_realization += feat.bytes_at_production; } else { internal_assert(false); @@ -2675,8 +2902,10 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.points_computed_per_production = subinstances / 
feat.num_productions; - feat.unique_bytes_read_per_point = global_bytes_loaded + shared_bytes_loaded + local_bytes_loaded + register_bytes_loaded; - feat.unique_lines_read_per_point = global_lines_loaded + shared_lines_loaded + local_lines_loaded + register_bytes_loaded; + feat.unique_bytes_read_per_point = + global_bytes_loaded + shared_bytes_loaded + local_bytes_loaded + register_bytes_loaded; + feat.unique_lines_read_per_point = + global_lines_loaded + shared_lines_loaded + local_lines_loaded + register_bytes_loaded; feat.num_global_mem_loads_per_block = global_mem_loads.num_transactions(); feat.global_mem_load_efficiency = global_mem_loads.efficiency(); @@ -2684,9 +2913,11 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.num_shared_mem_loads_per_block = shared_mem_loads.num_transactions(); feat.shared_mem_load_efficiency = shared_mem_loads.efficiency(); - internal_assert(in_range_zero_one(feat.global_mem_load_efficiency)) << "Invalid global mem load efficiency: " << feat.global_mem_load_efficiency; + internal_assert(in_range_zero_one(feat.global_mem_load_efficiency)) + << "Invalid global mem load efficiency: " << feat.global_mem_load_efficiency; - internal_assert(in_range_zero_one(feat.shared_mem_load_efficiency)) << "Invalid shared mem load efficiency: " << feat.shared_mem_load_efficiency; + internal_assert(in_range_zero_one(feat.shared_mem_load_efficiency)) + << "Invalid shared mem load efficiency: " << feat.shared_mem_load_efficiency; } // Track features for inlined Funcs @@ -2698,8 +2929,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, inlined_feat.num_scalars += it.value() * feat.num_scalars; if (inlined_feat.innermost_pure_loop_extent > 0) { inlined_feat.innermost_pure_loop_extent = - std::min(inlined_feat.innermost_pure_loop_extent, - feat.innermost_pure_loop_extent); + std::min(inlined_feat.innermost_pure_loop_extent, feat.innermost_pure_loop_extent); } else { inlined_feat.innermost_pure_loop_extent = feat.innermost_pure_loop_extent; } @@ -2764,16 +2994,19 @@ void LoopNest::compute_features(const FunctionDAG &dag, // required of 'g' should be 1 point for each point of 'out' but get_bounds() // will also include the edge 'g' -> 'f' and give the result 201 points for every point // of 'out') -Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const { +Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, + const vector &edge_chain) const { internal_assert(!edge_chain.empty()); internal_assert(edge_chain[0]->consumer == stage) - << "get_bounds_along_edge_chain must be called with an edge chain that begins from the current loop nest's node. But the given edge chain begins with " << edge_chain[0]->consumer->node->func.name() - << " not " << node->func.name(); + << "get_bounds_along_edge_chain must be called with an edge chain that begins from the current loop nest's " + "node. But the given edge chain begins with " + << edge_chain[0]->consumer->node->func.name() << " not " << node->func.name(); internal_assert(edge_chain.back()->producer == f) - << "get_bounds_along_edge_chain must be called with an edge chain that ends with the given node. But the given edge chain ends with " << edge_chain.back()->producer->func.name() - << " not " << f->func.name(); + << "get_bounds_along_edge_chain must be called with an edge chain that ends with the given node. 
But the given " + "edge chain ends with " + << edge_chain.back()->producer->func.name() << " not " << f->func.name(); vector bounds; BoundContents *bound; @@ -2841,9 +3074,8 @@ const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const { bound->region_required(i) = f->estimated_region_required[i]; } } else { - internal_assert(!f->outgoing_edges.empty()) - << "No consumers of " << f->func.name() - << " at loop over " << (is_root() ? "root" : node->func.name()) << "\n"; + internal_assert(!f->outgoing_edges.empty()) << "No consumers of " << f->func.name() << " at loop over " + << (is_root() ? "root" : node->func.name()) << "\n"; auto init = Span::empty_span(); for (int i = 0; i < f->dimensions; i++) { bound->region_required(i) = init; @@ -2851,9 +3083,7 @@ const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const { for (const auto *e : f->outgoing_edges) { // Ignore consumers outside of this loop nest - if (!is_root() && - (stage != e->consumer) && - (!stage->downstream_of(*(e->consumer->node)))) { + if (!is_root() && (stage != e->consumer) && (!stage->downstream_of(*(e->consumer->node)))) { continue; } const auto &c_bounds = get_bounds(e->consumer->node); @@ -3165,9 +3395,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, internal_assert(l.max() >= l.min()) << i << " " << l.max() << " " << l.min() << "\n"; - if (f->dimensions && - node->size[i] >= 1 && - f->stages[s].loop[i].var == f->func.args()[v]) { + if (f->dimensions && node->size[i] >= 1 && f->stages[s].loop[i].var == f->func.args()[v]) { node->vectorized_loop_index = (int)i; vector_size = (int64_t)(node->stage->vector_size); single_point->loops(s, i).set_extent(vector_size); @@ -3239,7 +3467,6 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const vector &rvars_to_move_inward) const { - // Split this loop and move factors to the inner loop LoopNest *inner = new LoopNest, *outer = new LoopNest; inner->node = outer->node = node; @@ -3476,11 +3703,7 @@ vector> LoopNest::compute_in_tiles(const FunctionDA can_compute_here = can_compute_here || (in_threads_loop && search_space_options.compute_at_thread()); // Place the computation directly inside this loop (provided it's not a SIMD loop) - if (!innermost && - (!in_realization || - size.empty() || - vector_dim == -1 || - size[vector_dim] == 1) && + if (!innermost && (!in_realization || size.empty() || vector_dim == -1 || size[vector_dim] == 1) && can_compute_here) { std::unique_ptr r{new LoopNest}; @@ -3527,7 +3750,16 @@ vector> LoopNest::compute_in_tiles(const FunctionDA in_threads_loop |= (children[child]->gpu_label == GPU_parallelism::Thread); // we must pass down union thread count constraints computed at block level when computing further in - auto opts = children[child]->compute_in_tiles(f, this, params, target, search_space_options, v, store_here, in_threads_loop, false, union_counts); + auto opts = children[child]->compute_in_tiles(f, + this, + params, + target, + search_space_options, + v, + store_here, + in_threads_loop, + false, + union_counts); for (IntrusivePtr &n : opts) { // (Only valid if one child calls f) Push the // computation into the child. 
Possibly leaving @@ -3643,7 +3875,8 @@ bool LoopNest::producer_computed_here_or_further_in(const FunctionDAG::Node *pro return false; } -void LoopNest::get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest) const { +void LoopNest::get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, + const LoopNest *compute_root_loop_nest) const { if (is_root()) { for (const auto &c : children) { descendants.emplace(c->stage, {}); @@ -3719,8 +3952,7 @@ void LoopNest::apply(LoopLevel here, state->vars.push_back(fv); } // Bubble the innermost pure dimension to the front of the pure dimensions - for (int i = vectorized_loop_index - 1; - i >= 0 && state->vars[i].pure; i--) { + for (int i = vectorized_loop_index - 1; i >= 0 && state->vars[i].pure; i--) { std::swap(state->vars[i], state->vars[i + 1]); } state_map.emplace(stage, std::unique_ptr(state)); @@ -3792,8 +4024,7 @@ void LoopNest::apply(LoopLevel here, // stage's types and will often be 1, in which case we // don't want to vectorize the loop if (!target.has_gpu_feature() || stage->vector_size > 1) { - state.schedule_source - << "\n .vectorize(" << v.var.name() << ")"; + state.schedule_source << "\n .vectorize(" << v.var.name() << ")"; s.vectorize(v.var); v.vectorized = true; state.vectorized = true; @@ -3849,7 +4080,8 @@ void LoopNest::apply(LoopLevel here, } auto tail_strategy = pure_var_tail_strategy; - // If it's an RVar, or not the outermost split and we're in an update, we need a guard with if instead. + // If it's an RVar, or not the outermost split and we're in an update, we need a guard with if + // instead. // If the factor evenly divides the parent extent, then // no tail strategy is needed @@ -3863,13 +4095,9 @@ void LoopNest::apply(LoopLevel here, } s.split(parent.var, parent.var, inner, (int)factor, tail_strategy); - state.schedule_source - << "\n .split(" - << parent.var.name() << ", " - << parent.var.name() << ", " - << inner.name() << ", " - << factor << ", " - << "TailStrategy::" << tail_strategy << ")"; + state.schedule_source << "\n .split(" << parent.var.name() << ", " << parent.var.name() + << ", " << inner.name() << ", " << factor << ", " + << "TailStrategy::" << tail_strategy << ")"; v = parent; parent.extent = size[parent.index]; v.constant_extent = (!parent.var.is_rvar && parent.exists); @@ -3927,7 +4155,8 @@ void LoopNest::apply(LoopLevel here, if (!found) { here = LoopLevel(node->func, Var::outermost()); } - // internal_assert(found) << "Could not find appropriate compute_at location for children of " << node->func.name() << "\n"; + // internal_assert(found) << "Could not find appropriate compute_at location for children of " << + // node->func.name() << "\n"; state.vars.insert(state.vars.begin(), new_inner.begin(), new_inner.end()); } } @@ -3984,7 +4213,8 @@ void LoopNest::apply(LoopLevel here, } } -void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const NodeMap &all_inlined) const { +void LoopNest::update_producers_to_be_staged(StageScheduleState &state, + const NodeMap &all_inlined) const { std::vector>> pending; std::vector edge_chain; pending.emplace_back(stage, edge_chain); @@ -4017,7 +4247,8 @@ void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const No continue; } - if (other_stage_has_same_producer(e->producer) || producer_computed_here_or_further_in(e->producer) || !e->all_load_jacobian_coeffs_exist()) { + if (other_stage_has_same_producer(e->producer) || producer_computed_here_or_further_in(e->producer) 
|| + !e->all_load_jacobian_coeffs_exist()) { continue; } @@ -4053,7 +4284,8 @@ bool LoopNest::has_valid_thread_extents() const { return true; } -void LoopNest::collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, NodeMap &inlined_nodes) const { +void LoopNest::collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, + NodeMap &inlined_nodes) const { if (innermost) { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); diff --git a/src/autoschedulers/anderson2021/LoopNest.h b/src/autoschedulers/anderson2021/LoopNest.h index b315d460e9c7..b0dee6cedf24 100644 --- a/src/autoschedulers/anderson2021/LoopNest.h +++ b/src/autoschedulers/anderson2021/LoopNest.h @@ -29,21 +29,25 @@ using NodeMap = PerfectHashMap; template using StageMap = PerfectHashMap; -enum class GPU_parallelism { Block, - Thread, - Serial, - Simd, - Parallelized, - None }; +enum class GPU_parallelism { + Block, + Thread, + Serial, + Simd, + Parallelized, + None +}; std::string stringify(GPU_parallelism label); // inlined => func is inlined so has no memory store location -enum class GPUMemoryType { Global, - Shared, - Local, - Registers, - Inlined }; +enum class GPUMemoryType { + Global, + Shared, + Local, + Registers, + Inlined +}; bool may_subtile(const Anderson2021Params ¶ms); @@ -234,9 +238,13 @@ struct LoopNest { } }; - GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined = false) const; + GPUMemoryType get_gpu_memory_type(bool in_block, + bool in_thread, + bool is_inlined = false) const; - std::vector unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const; + std::vector unrolled_loops(const Target &target, + const LoopNest *parent, + const LoopNest *grandparent) const; void get_allocs_that_can_be_promoted_to_registers(const Target &target, StageMap &sites, @@ -244,7 +252,8 @@ struct LoopNest { const LoopNest *grandparent, const LoopNest *parent) const; - bool promote_allocs_to_registers(const Target &target, StageMap &sites) const; + bool promote_allocs_to_registers(const Target &target, + StageMap &sites) const; // Compute all the sites of interest for each pipeline stage void get_sites(const Target &target, @@ -265,7 +274,9 @@ struct LoopNest { } } - bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const; + bool exceeds_serial_extents_limit(const Target &target, + const LoopNest *parent, + bool in_threads_loop) const; bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const; @@ -279,39 +290,105 @@ struct LoopNest { // Get the stride over "node's" storage for a unit increment in the vectorized loop's // index - double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const; - - Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose = false) const; - - bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const; + double storage_stride(const LoadJacobian &jac, + int innermost_storage_dim, + const FunctionDAG::Node *storage_node, + const Bound &store_bounds, + const LoopNest &root) const; + + Strides compute_strides(const LoadJacobian &jac, + int innermost_storage_dim, + const FunctionDAG::Node *storage_node, + const Bound &store_bounds, + 
const ThreadInfo *thread_info, + bool verbose = false) const; + + bool all_strides_exist(const LoadJacobian &jac, + const FunctionDAG::Node *storage_node, + const LoopNest &root) const; int get_actual_vector_dim(const Bound &store_bounds) const; - void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose = false) const; - - bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const; - - bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const; - - int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; - - int vectorized_access_size(size_t loop_index, bool verbose = false) const; + void compute_gpu_store_features(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const GPULoopInfo &gpu_loop_info, + const std::vector &inner_serial_loop_extents, + const Sites &consumer_site, + ScheduleFeatures &feat, + const LoopNest *parent, + const LoopNest &root, + GlobalMemInfo &global_mem_loads, + SharedMemInfo &shared_mem_loads, + LocalMemInfo &local_mem_loads, + bool verbose = false) const; + + bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + int innermost_dim, + int loop_index) const; + + bool can_vectorize_store_access(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + bool accessed_has_been_scheduled, + int innermost_dim, + int loop_index, + const GPUMemoryType &mem_type) const; + + int vectorized_load_access_size(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + bool accessed_has_been_scheduled, + int innermost_dim, + const GPUMemoryType &mem_type, + bool verbose = false) const; + + int vectorized_access_size(size_t loop_index, + bool verbose = false) const; template - void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose = false) const; - - std::pair compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const; + void compute_num_mem_accesses_per_block(const LoadJacobian &jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose = false) const; + + std::pair compute_local_mem_store_features(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const LoopNest &root, + double serial_loop_extents) const; template - MemInfoType compute_mem_store_info(const 
LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const; + MemInfoType compute_mem_store_info(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const ThreadInfo *thread_info, + double serial_loop_extents, + bool verbose) const; template - void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType &mem_info, double serial_loop_extents, bool verbose = false) const; - - double compute_local_mem_stride(double stride, double bytes) const; + void compute_mem_load_features(const LoadJacobian &jac, + int producer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &producer_store_bounds, + bool producer_has_been_scheduled, + const ThreadInfo *thread_info, + MemInfoType &mem_info, + double serial_loop_extents, + bool verbose = false) const; + + double compute_local_mem_stride(double stride, + double bytes) const; // Assumes block, serial, thread or block, thread nesting - const LoopNest *get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const; + const LoopNest *get_enclosing_block(const LoopNest *parent, + const LoopNest *grandparent) const; std::pair get_block_and_serial_extents(const LoopNest *block) const; @@ -319,20 +396,44 @@ struct LoopNest { bool has_thread_loop_descendant() const; - void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const; + void compute_warp_features(ScheduleFeatures &features, + const GPULoopInfo &gpu_loop_info) const; // Assume that when a block is active, all its warps are active - void compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const; + void compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, + ScheduleFeatures &feat, + const GPULoopInfo &gpu_loop_info) const; - void compute_shared_mem_occupancy(const Anderson2021Params ¶ms, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const; + void compute_shared_mem_occupancy(const Anderson2021Params ¶ms, + const Target &target, + int64_t total_shared_mem_alloc_size, + ScheduleFeatures &feat) const; std::pair find_innermost_and_parent() const; - int64_t points_accessed_per_thread(const Anderson2021Params ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; - - int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const; - - void memoize_points_computed_minimum(StageMap &memoized_features, const StageMap *features) const; + int64_t points_accessed_per_thread(const Anderson2021Params ¶ms, + const Target &target, + const GPULoopInfo &gpu_loop_info, + const std::vector &edge_chain, + const LoadJacobian &jac, + const LoopNest *parent, + const LoopNest *grandparent, + int64_t n, + const ScheduleFeatures &feat, + const LoadJacobian &serial_jac, + bool 
producer_has_been_scheduled, + int producer_innermost_dim, + const GPUMemoryType &mem_type, + bool verbose) const; + + int64_t compute_licm_amortization(const LoopNest *innermost, + const LoopNest *parent, + const ScheduleFeatures &feat, + const LoadJacobian &jac, + int producer_dims) const; + + void memoize_points_computed_minimum(StageMap &memoized_features, + const StageMap *features) const; vector> collect_producers(const StageMap &sites) const; @@ -340,12 +441,14 @@ struct LoopNest { void collect_stages(std::set &stages) const; - void memoize_features(StageMap &memoized_features, const StageMap *features) const; + void memoize_features(StageMap &memoized_features, + const StageMap *features) const; void compute_working_set_from_features(int64_t *working_set, const StageMap *features) const; - void recompute_inlined_features(const StageMap &sites, StageMap *features) const; + void recompute_inlined_features(const StageMap &sites, + StageMap *features) const; std::pair compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const; @@ -389,7 +492,8 @@ struct LoopNest { // consumers along the given edge chain), from which we know what region // would be computed if it were scheduled here and what its loop nest // would be. - Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const; + Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, + const vector &edge_chain) const; void dump() const; @@ -443,13 +547,16 @@ struct LoopNest { bool move_all_rvars_inward = true, const vector &rvars_to_move_inward = {}) const; - int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false, bool in_threads_loop = false) const; + int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false, + bool in_threads_loop = false) const; int64_t get_total_constant_local_mem_alloc_size() const; // All store ats further in than the block level must be fixed // sized allocations. This method checks if f will require a dynamic // allocation - bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const; + bool requires_dynamic_allocation(const FunctionDAG::Node *f, + const Target &target, + bool in_threads_loop) const; // Return all possible ways to compute f in tiles somewhere within // this loop nest. @@ -501,18 +608,19 @@ struct LoopNest { size_t index = 0; // Some flags. 
- bool innermost_pure_dim = false, - outermost = false, - parallel = false, - exists = false, - pure = false, - constant_extent = false; + bool innermost_pure_dim = false; + bool outermost = false; + bool parallel = false; + bool exists = false; + bool pure = false; + bool constant_extent = false; bool vectorized = false; bool gpu_threads = false; FuncVar() - : orig(Var()), var(Var()) { + : orig(Var()), + var(Var()) { } }; const FunctionDAG::Node *node; @@ -529,7 +637,8 @@ struct LoopNest { vector ordered_vars; vector gpu_thread_extents; - NodeMap>>> producers_to_be_staged; + NodeMap>>> + producers_to_be_staged; // From outermost in vector ancestors; @@ -544,8 +653,10 @@ struct LoopNest { int num_serial_loops() const; bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const; - void update_producers_to_be_staged(StageScheduleState &state, const NodeMap &all_inlined) const; - bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const; + void update_producers_to_be_staged(StageScheduleState &state, + const NodeMap &all_inlined) const; + bool region_computed_shrinks(const FunctionDAG::Node *f, + const LoopNest *parent) const; // Apply the schedule represented by this loop nest to a Halide pipeline. void apply(LoopLevel here, @@ -558,18 +669,21 @@ struct LoopNest { std::vector &ancestors, const NodeMap &all_inlined) const; - double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const; + double max_idle_lane_wastage(const Target &target, + GPULoopInfo gpu_loop_info) const; bool has_valid_thread_extents() const; - void collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, NodeMap &inlined_nodes) const; + void collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, + NodeMap &inlined_nodes) const; void collect_all_inlined(NodeMap &all_inlined) const; int64_t product_of_self_and_descendants(int loop_index) const; int64_t product_of_descendants(int loop_index) const; - void get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest = nullptr) const; + void get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, + const LoopNest *compute_root_loop_nest = nullptr) const; }; struct Filter { @@ -577,7 +691,8 @@ struct Filter { bool logging = false; explicit Filter(const LoopNest *loop_nest) - : loop_nest{loop_nest}, logging{enable_filter_printing()} { + : loop_nest{loop_nest}, + logging{enable_filter_printing()} { if (logging) { std::cerr << "\nState filtered: \n"; loop_nest->dump(); diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index 278a6d30808d..87cd7cf880bd 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -13,11 +13,19 @@ SearchSpace::SearchSpace(const FunctionDAG &dag, CostModel *cost_model, Statistics &stats, const LoopNestParser *partial_schedule) - : dag{dag}, params{params}, target{target}, search_space_options{params.search_space_options}, rng{rng}, cost_model{cost_model}, stats{stats}, partial_schedule{partial_schedule} { + : dag{dag}, + params{params}, + target{target}, + search_space_options{params.search_space_options}, + rng{rng}, + cost_model{cost_model}, + stats{stats}, + partial_schedule{partial_schedule} { memoized_compute_root_blocks.make_large(dag.nodes.size()); } -void SearchSpace::memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root) { +void 
SearchSpace::memoize_blocks(const FunctionDAG::Node *node, + LoopNest *new_root) { int vector_dim = -1; bool loop_nest_found = false; for (auto &c : new_root->children) { @@ -141,8 +149,7 @@ vector SearchSpace::filter_parallel_tile_option } const double tasks_per_core = ((double)total) / params.parallelism; o.idle_core_wastage = std::max(o.idle_core_wastage, - std::ceil(tasks_per_core) / - tasks_per_core); + std::ceil(tasks_per_core) / tasks_per_core); } } } @@ -391,7 +398,15 @@ void SearchSpace::generate_children(const IntrusivePtr &state, std::unordered_map secondary_options; for (int vector_dim : vector_dims) { Timer timer; - auto tile_options = root->compute_in_tiles(node, nullptr, params, target, search_space_options, vector_dim, false, false, is_pre_pass); + auto tile_options = root->compute_in_tiles(node, + nullptr, + params, + target, + search_space_options, + vector_dim, + false, + false, + is_pre_pass); stats.compute_in_tiles_time += timer.elapsed(); timer.restart(); @@ -490,7 +505,13 @@ void SearchSpace::generate_children(const IntrusivePtr &state, // at root level sibling thread counts are in separate blocks, extents are irrelevant vector max_size((int)(stage_sizes[0].size()), 1); - auto block_tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, node->dimensions - 1, vectorized_indices, false, true); + auto block_tilings = generate_gpu_tilings(stage_sizes, + pure_dims, + max_size, + node->dimensions - 1, + vectorized_indices, + false, + true); // If no options, create a thread tiling as large as possible with block size (1,1,1). // This can happen if the loops are too small to generate desired gpu tiles. @@ -517,7 +538,10 @@ void SearchSpace::generate_children(const IntrusivePtr &state, double prev_idle_core_wastage = 0; for (const auto &o : options) { - if (!params.randomize_tilings && num_children >= 1 && o.idle_core_wastage > 1.2 && o.idle_core_wastage != prev_idle_core_wastage) { + if (!params.randomize_tilings && + num_children >= 1 && + o.idle_core_wastage > 1.2 && + o.idle_core_wastage != prev_idle_core_wastage) { // We have considered several options, and the // remaining ones leave lots of cores idle. 
break; @@ -606,9 +630,8 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr &best) { internal_assert(n.first >= 0); } - std::sort(node_ids_and_costs.begin(), node_ids_and_costs.end(), [](const std::pair &a, const std::pair &b) { - return a.second < b.second; - }); + std::sort(node_ids_and_costs.begin(), node_ids_and_costs.end(), + [](const std::pair &a, const std::pair &b) { return a.second < b.second; }); size_t num_to_freeze = num_nodes - std::log2(num_nodes); NodeMap nodes_to_freeze; @@ -631,7 +654,8 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr &best) { } } -vector> SearchSpace::generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, const FunctionDAG::Node *node) const { +vector> SearchSpace::generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, + const FunctionDAG::Node *node) const { std::vector vec_dim_serial_sizes; pure_stage->generate_vec_dim_serial_tilings(vec_dim_serial_sizes); diff --git a/src/autoschedulers/anderson2021/SearchSpace.h b/src/autoschedulers/anderson2021/SearchSpace.h index 1e80c0e1760f..b9dddb4c7f04 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.h +++ b/src/autoschedulers/anderson2021/SearchSpace.h @@ -69,7 +69,8 @@ struct SearchSpace { vector filter_thread_tile_options(vector> &loop_nests) const; - void memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root); + void memoize_blocks(const FunctionDAG::Node *node, + LoopNest *new_root); bool add_states_from_memoized_blocks(const IntrusivePtr &state, std::function &&)> &accept_child, @@ -84,7 +85,8 @@ struct SearchSpace { void freeze_lowest_cost_stages(const IntrusivePtr &best); - vector> generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, const FunctionDAG::Node *node) const; + vector> generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, + const FunctionDAG::Node *node) const; bool add_child(const IntrusivePtr &state, const IntrusivePtr &new_root, diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index 60a990dff536..e58507c5b7b8 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -19,15 +19,21 @@ uint64_t State::structural_hash(int depth) const { } // Compute the parent and depth of every loop nest node -void State::compute_loop_nest_parents(map> &p, - const LoopNest *here, int depth) const { +void State::compute_loop_nest_parents(LoopNestMap &p, + const LoopNest *here, + int depth) const { for (const auto &c : here->children) { p.emplace(c.get(), pair{here, depth}); compute_loop_nest_parents(p, c.get(), depth + 1); } } -const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params ¶ms, const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const { +const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params ¶ms, + const LoopNestMap &parent, + const FunctionDAG::Node &node, + const LoopNest *loop, + const LoopNest *root, + StageMap &total_shared_mem_alloc_sizes) const { std::vector ancestors; // Innermost loop nests are never considered as compute locations @@ -102,7 +108,8 @@ const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params & return candidate; } -int64_t State::total_loop_extents_of_ancestors(const map> &parent, const LoopNest *loop) const { +int64_t State::total_loop_extents_of_ancestors(const LoopNestMap &parent, + const LoopNest *loop) const { int64_t 
total = 1; if (loop->is_root()) { @@ -125,7 +132,9 @@ int64_t State::total_loop_extents_of_ancestors(const map> &parent, const LoopNest *a, const LoopNest *b) const { +const LoopNest *State::deepest_common_ancestor(const LoopNestMap &parent, + const LoopNest *a, + const LoopNest *b) const { if (a->is_root()) { return a; } @@ -343,7 +352,8 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) } } -IntrusivePtr State::get_root_for_features(const Anderson2021Params ¶ms, const Target &target) const { +IntrusivePtr State::get_root_for_features(const Anderson2021Params ¶ms, + const Target &target) const { if (!has_compute_root_loops_without_blocks() && !has_loop_nest_without_thread_loops()) { return root; } @@ -352,7 +362,8 @@ IntrusivePtr State::get_root_for_features(const Anderson2021Para // We copy the loop nest in 2 cases: // - If the current loop nest has compute root loops without blocks (it is - // in phase 1 and the outer loops are marked 'none'), we split the loop into blocks and threads so we can compute meaningful features + // in phase 1 and the outer loops are marked 'none'), we split the loop into blocks and threads so we can compute + // meaningful features // - If there are serial loops inside blocks without a surrounding // thread loop nest, we create a surrounding thread loop nest with // extents 1 (which Halide will do when the schedule is compiled) so @@ -361,7 +372,9 @@ IntrusivePtr State::get_root_for_features(const Anderson2021Para return new_root; } -void State::set_gpu_store_site(const map> &parent, const LoopNest *loop, LoopNest::Sites &site) const { +void State::set_gpu_store_site(const LoopNestMap &parent, + const LoopNest *loop, + LoopNest::Sites &site) const { // If site.store is inside a block but outside a loop, the // GPU store site should instead be the block because the shared // mem allocation will be hoisted @@ -393,7 +406,12 @@ void State::set_gpu_store_site(const map *features, Statistics &stats, bool verbose) const { +bool State::compute_featurization(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + StageMap *features, + Statistics &stats, + bool verbose) const { auto feature_root = get_root_for_features(params, target); StageMap sites; @@ -426,7 +444,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para // For the unscheduled nodes, give them sites as deep as they // could possibly be. We'll ignore the possibility of inlining // them for now. - map> parent; + LoopNestMap parent; compute_loop_nest_parents(parent, feature_root.get(), 0); for (const auto &n : dag.nodes) { if (sites.contains(&(n.stages[0]))) { @@ -474,14 +492,17 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para } } } - internal_assert(loop) - << "Could not compute plausible site for unscheduled Func: " - << n.func.name() << "\n"; + internal_assert(loop) << "Could not compute plausible site for unscheduled Func: " << n.func.name() << "\n"; // If 'loop' would never be considered as a compute location (i.e. 
by // LoopNest::compute_in_tiles()), walk up the loop nest until we reach a // location that would be considered - loop = deepest_valid_compute_location(params, parent, n, loop, feature_root.get(), total_shared_mem_alloc_sizes); + loop = deepest_valid_compute_location(params, + parent, + n, + loop, + feature_root.get(), + total_shared_mem_alloc_sizes); int64_t num_realizations = total_loop_extents_of_ancestors(parent, loop); for (const auto &stage : n.stages) { @@ -501,7 +522,24 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para Timer timer; feature_root->dump(); - feature_root->compute_features(dag, params, target, sites, 1, 1, nullptr, nullptr, *feature_root, GPULoopInfo(feature_root.get()), true, total_shared_mem_alloc_sizes, nullptr, nullptr, nullptr, features, stats, verbose); + feature_root->compute_features(dag, + params, + target, + sites, + 1, + 1, + nullptr, + nullptr, + *feature_root, + GPULoopInfo(feature_root.get()), + true, + total_shared_mem_alloc_sizes, + nullptr, + nullptr, + nullptr, + features, + stats, + verbose); stats.featurization_time += timer.elapsed(); ++stats.num_featurizations; @@ -509,15 +547,17 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para for (const auto &n : dag.nodes) { if (sites.get(&(n.stages[0])).produce == nullptr) { internal_assert(!features->contains(&(n.stages[0]))) - << "Somehow an input or unscheduled node ended up in the featurization: " - << n.func.name() << "\n"; + << "Somehow an input or unscheduled node ended up in the featurization: " << n.func.name() << "\n"; } } return true; } -void State::save_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, std::ostream &out) const { +void State::save_featurization(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + std::ostream &out) const { StageMap features; Statistics stats; compute_featurization(dag, params, target, &features, stats); @@ -547,7 +587,8 @@ void State::save_featurization(const FunctionDAG &dag, const Anderson2021Params } } -bool State::contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const { +bool State::contains_store_at(const set &outermost_store_at, + const IntrusivePtr &parent) const { for (const auto &c : parent->children) { if (!c->store_at.empty()) { return true; @@ -594,7 +635,8 @@ bool State::exceeds_serial_extents_limit(const Target &target) const { return root->exceeds_serial_extents_limit(target, nullptr, false); } -int64_t State::get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const { +int64_t State::get_shared_mem_alloc_size(const LoopNest *block, + const LoopNest *loop) const { int64_t result = 0; if (loop->gpu_label == GPU_parallelism::Thread) { @@ -622,7 +664,8 @@ int64_t State::get_shared_mem_alloc_size(const LoopNest *block, const LoopNest * return result; } -bool State::exceeds_shared_memory_limit(const Anderson2021Params ¶ms, const Target &target) const { +bool State::exceeds_shared_memory_limit(const Anderson2021Params ¶ms, + const Target &target) const { if (!target.has_gpu_feature()) { return false; } @@ -644,7 +687,8 @@ bool State::exceeds_shared_memory_limit(const Anderson2021Params ¶ms, const return false; } -bool State::exceeds_local_memory_limit(const Anderson2021Params ¶ms, const Target &target) const { +bool State::exceeds_local_memory_limit(const Anderson2021Params ¶ms, + const Target &target) const { if (!target.has_gpu_feature()) { return false; } @@ -662,7 
+706,12 @@ bool State::exceeds_local_memory_limit(const Anderson2021Params ¶ms, const T return false; } -bool State::calculate_cost(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose) { +bool State::calculate_cost(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + CostModel *cost_model, + Statistics &stats, + bool verbose) { Timer timer; if (!root->has_valid_thread_extents()) { Filter(root.get()) << "Invalid thread extents\n"; @@ -778,7 +827,11 @@ void State::print_compute_locations() const { aslog(1) << "END compute locations\n"; } -void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector ¶llel_vars, const vector ¶llel_extents, const vector &constant_extents) const { +void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, + Stage &stage, + const vector ¶llel_vars, + const vector ¶llel_extents, + const vector &constant_extents) const { if (parallel_vars.empty() || parallel_extents.empty()) { return; } @@ -848,12 +901,16 @@ void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, c } } -void State::mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector ¶llel_vars, const vector ¶llel_extents) const { +void State::mark_gpu_blocks(LoopNest::StageScheduleState *state, + Stage &stage, + const vector ¶llel_vars, + const vector ¶llel_extents) const { int max_blocks[3] = {2147483647, 65535, 65535}; uint8_t n_loops_tagged_gpu_blocks = 0; for (const auto &v : parallel_vars) { - if (n_loops_tagged_gpu_blocks >= 3 || parallel_extents[n_loops_tagged_gpu_blocks] > max_blocks[n_loops_tagged_gpu_blocks]) { + if (n_loops_tagged_gpu_blocks >= 3 || + parallel_extents[n_loops_tagged_gpu_blocks] > max_blocks[n_loops_tagged_gpu_blocks]) { break; } @@ -867,7 +924,10 @@ void State::mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, c } } -bool State::mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const { +bool State::mark_gpu_threads(LoopNest::StageScheduleState *state, + Stage &stage, + std::unordered_set &new_serial_vars, + std::ostringstream &staged_funcs_schedule_source) const { uint8_t num_loops_tagged_gpu_thread = 0; int64_t total_threads = 1; int max_threads[3] = {1024, 1024, 64}; @@ -879,7 +939,9 @@ bool State::mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, continue; } - if (num_loops_tagged_gpu_thread >= 3 || total_threads >= MAX_THREADS_PER_BLOCK || v.extent > max_threads[num_loops_tagged_gpu_thread]) { + if (num_loops_tagged_gpu_thread >= 3 || + total_threads >= MAX_THREADS_PER_BLOCK || + v.extent > max_threads[num_loops_tagged_gpu_thread]) { break; } @@ -1147,7 +1209,10 @@ void State::apply_schedule(const FunctionDAG &dag, const Anderson2021Params &par } } - bool thread_loop_exists = mark_gpu_threads(p.second.get(), stage, new_serial_vars, staged_funcs_schedule_source); + bool thread_loop_exists = mark_gpu_threads(p.second.get(), + stage, + new_serial_vars, + staged_funcs_schedule_source); // The stage has no threads and no blocks. 
This is likely an update // stage where the reduction is a serial loop if (!thread_loop_exists && !has_enclosing_parallel) { diff --git a/src/autoschedulers/anderson2021/State.h b/src/autoschedulers/anderson2021/State.h index 846c895a4c53..c2b0371dce3f 100644 --- a/src/autoschedulers/anderson2021/State.h +++ b/src/autoschedulers/anderson2021/State.h @@ -47,7 +47,10 @@ struct NoOpMutator { }; template -void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr &existing_loop_nest, const PostCreateMutator &post_create_mutator) { +void deep_copy_loop_nest(LoopNest *new_loop_nest, + const LoopNest *new_loop_nest_parent, + const IntrusivePtr &existing_loop_nest, + const PostCreateMutator &post_create_mutator) { new_loop_nest->copy_from(*existing_loop_nest); for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) { @@ -59,8 +62,11 @@ void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_ post_create_mutator(new_loop_nest); } +using LoopNestMap = map>; + template -LoopNest *deep_copy_loop_nest(const IntrusivePtr &loop_nest, const PostCreateMutator &post_create_mutator) { +LoopNest *deep_copy_loop_nest(const IntrusivePtr &loop_nest, + const PostCreateMutator &post_create_mutator) { LoopNest *new_loop_nest = new LoopNest; deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator); return new_loop_nest; @@ -86,11 +92,13 @@ struct State { uint64_t structural_hash(int depth) const; // Compute the parent and depth of every loop nest node - void compute_loop_nest_parents(map> &p, - const LoopNest *here, int depth) const; + void compute_loop_nest_parents(LoopNestMap &p, + const LoopNest *here, + int depth) const; - const LoopNest *deepest_common_ancestor(const map> &parent, - const LoopNest *a, const LoopNest *b) const; + const LoopNest *deepest_common_ancestor(const LoopNestMap &parent, + const LoopNest *a, + const LoopNest *b) const; // We use the post_create_mutator so that the loop nests can be modified // before they become IntrusivePtr as children and cannot be modified @@ -122,15 +130,27 @@ struct State { void add_outer_thread_loops(LoopNest *loop_nest) const; }; - IntrusivePtr get_root_for_features(const Anderson2021Params ¶ms, const Target &target) const; + IntrusivePtr get_root_for_features(const Anderson2021Params ¶ms, + const Target &target) const; - void set_gpu_store_site(const map> &parent, const LoopNest *loop, LoopNest::Sites &site) const; + void set_gpu_store_site(const LoopNestMap &parent, + const LoopNest *loop, + LoopNest::Sites &site) const; - bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, StageMap *features, Statistics &stats, bool verbose = false) const; + bool compute_featurization(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + StageMap *features, + Statistics &stats, + bool verbose = false) const; - void save_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, std::ostream &out) const; + void save_featurization(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + std::ostream &out) const; - bool contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const; + bool contains_store_at(const set &outermost_store_at, + const IntrusivePtr &parent) const; // For GPU, only allow store_at root or inside the outermost loop nest. 
Any // store_ats further in will be hoisted and expanded, increasing the @@ -141,13 +161,21 @@ struct State { bool exceeds_serial_extents_limit(const Target &target) const; - int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const; + int64_t get_shared_mem_alloc_size(const LoopNest *block, + const LoopNest *loop) const; - bool exceeds_shared_memory_limit(const Anderson2021Params ¶ms, const Target &target) const; + bool exceeds_shared_memory_limit(const Anderson2021Params ¶ms, + const Target &target) const; - bool exceeds_local_memory_limit(const Anderson2021Params ¶ms, const Target &target) const; + bool exceeds_local_memory_limit(const Anderson2021Params ¶ms, + const Target &target) const; - bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false); + bool calculate_cost(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + CostModel *cost_model, + Statistics &stats, + bool verbose = false); // Make a child copy of this state. The loop nest is const (we // make mutated copies of it, rather than mutating it), so we can @@ -159,25 +187,43 @@ struct State { void print_compute_locations() const; - void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector ¶llel_vars, const vector ¶llel_extents, const vector &constant_extents) const; + void fuse_gpu_blocks(LoopNest::StageScheduleState *state, + Stage &stage, + const vector ¶llel_vars, + const vector ¶llel_extents, + const vector &constant_extents) const; - void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector ¶llel_vars, const vector ¶llel_extents) const; + void mark_gpu_blocks(LoopNest::StageScheduleState *state, + Stage &stage, + const vector ¶llel_vars, + const vector ¶llel_extents) const; - bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const; + bool mark_gpu_threads(LoopNest::StageScheduleState *state, + Stage &stage, + std::unordered_set &new_serial_vars, + std::ostringstream &staged_funcs_schedule_source) const; bool can_fuse_gpu(const vector ¶llel_extents) const; // Apply the schedule represented by this state to a Halide // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. 
- void apply_schedule(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target); + void apply_schedule(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target); bool should_always_consider_inline(const FunctionDAG::Node *node) const; void add_to_always_consider_inline_options(const FunctionDAG::Node *node); void update_always_consider_inline_options(const FunctionDAG::Node *node); - const LoopNest *deepest_valid_compute_location(const Anderson2021Params ¶ms, const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const; - int64_t total_loop_extents_of_ancestors(const map> &parent, const LoopNest *loop) const; + const LoopNest *deepest_valid_compute_location(const Anderson2021Params ¶ms, + const LoopNestMap &parent, + const FunctionDAG::Node &node, + const LoopNest *loop, + const LoopNest *root, + StageMap &total_shared_mem_alloc_sizes) const; + int64_t total_loop_extents_of_ancestors(const LoopNestMap &parent, + const LoopNest *loop) const; }; // A priority queue of states, sorted according to increasing diff --git a/src/autoschedulers/anderson2021/Statistics.h b/src/autoschedulers/anderson2021/Statistics.h index a42717f75609..f725129d40ef 100644 --- a/src/autoschedulers/anderson2021/Statistics.h +++ b/src/autoschedulers/anderson2021/Statistics.h @@ -20,7 +20,8 @@ struct ScopedStatistic { std::string msg; ScopedStatistic(const T &value, const std::string &msg) - : value{value}, msg{msg} { + : value{value}, + msg{msg} { } ~ScopedStatistic() { @@ -33,7 +34,8 @@ struct ScopedTimer { std::string msg; explicit ScopedTimer(const std::string &msg) - : start{Clock::now()}, msg{msg} { + : start{Clock::now()}, + msg{msg} { aslog(1) << "Start: " << msg << "\n"; } diff --git a/src/autoschedulers/anderson2021/Tiling.cpp b/src/autoschedulers/anderson2021/Tiling.cpp index 780151e6b9ec..4d4006d757e5 100644 --- a/src/autoschedulers/anderson2021/Tiling.cpp +++ b/src/autoschedulers/anderson2021/Tiling.cpp @@ -15,7 +15,8 @@ bool all_ones(const std::vector &nums) { return true; } -bool equal_to_existing_size(const std::vector &s, const std::vector &nums) { +bool equal_to_existing_size(const std::vector &s, + const std::vector &nums) { for (size_t i = 0; i < s.size(); ++i) { if (s[i] != nums[i]) { return false; @@ -24,7 +25,8 @@ bool equal_to_existing_size(const std::vector &s, const std::vector> generate_serial_tilings(const std::vector &s, int d, +std::vector> generate_serial_tilings(const std::vector &s, + int d, int last_d, int vectorized_index, const std::vector &vec_dim_serial_sizes, @@ -35,7 +37,13 @@ std::vector> generate_serial_tilings(const std::vector> v; - v = generate_serial_tilings(s, d - 1, last_d, vectorized_index, vec_dim_serial_sizes, filter_small_outer_extents, allow_inner_ones); + v = generate_serial_tilings(s, + d - 1, + last_d, + vectorized_index, + vec_dim_serial_sizes, + filter_small_outer_extents, + allow_inner_ones); for (auto t : v) { t.push_back(0); bool used_full_extent = false; @@ -90,7 +98,9 @@ std::vector> generate_serial_tilings(const std::vector> generate_tilings(const std::vector &s, int d, int factor, +std::vector> generate_tilings(const std::vector &s, + int d, + int factor, bool allow_splits, const std::vector &inner_sizes) { std::vector> result; @@ -199,7 +209,9 @@ std::vector> generate_tilings(const std::vector &s // Moves vectorized dimension first and also removes dimensions with size 1 // to reflect actual thread dimensions when loop nests are lowered 
-void lowered_dims(const std::vector &size, int vector_loop_i, std::vector &lowered_size) { +void lowered_dims(const std::vector &size, + int vector_loop_i, + std::vector &lowered_size) { if (vector_loop_i >= 0 && size[vector_loop_i] > 1) { lowered_size.push_back(size[vector_loop_i]); } @@ -238,12 +250,20 @@ std::vector> generate_gpu_tilings(const std::vector> v; - v = generate_gpu_tilings(stage_sizes, pure_dims, max_s, d - 1, vectorized_indices, serial_inner, is_compute_root_stage); + v = generate_gpu_tilings(stage_sizes, + pure_dims, + max_s, + d - 1, + vectorized_indices, + serial_inner, + is_compute_root_stage); for (auto t : v) { - enum validity { serial_count_err, - thread_count_err, - valid_tiling }; + enum validity { + serial_count_err, + thread_count_err, + valid_tiling + }; // helper function detects whether tiling is legal: cannot exceed max thread count, // have more than three dimensions with ext > 1, or result in large serial loops @@ -314,11 +334,14 @@ std::vector> generate_gpu_tilings(const std::vector max_threads_extent) || (d != vectorized_indices[0] && threads_ext > 16)) { + if ((d == vectorized_indices[0] && threads_ext > max_threads_extent) || + (d != vectorized_indices[0] && threads_ext > 16)) { break; } int64_t other_ext = (stage_sizes[0][d] + threads_ext - 1) / threads_ext; - if (d != vectorized_indices[0] && threads_ext > 1 && threads_ext * other_ext * 7 > stage_sizes[0][d] * 8) { + if (d != vectorized_indices[0] && + threads_ext > 1 && + threads_ext * other_ext * 7 > stage_sizes[0][d] * 8) { break; } t.back() = threads_ext; diff --git a/src/autoschedulers/anderson2021/Tiling.h b/src/autoschedulers/anderson2021/Tiling.h index fb82672b2e06..b1e711f93ad0 100644 --- a/src/autoschedulers/anderson2021/Tiling.h +++ b/src/autoschedulers/anderson2021/Tiling.h @@ -10,9 +10,11 @@ namespace Autoscheduler { bool all_ones(const std::vector &nums); -bool equal_to_existing_size(const std::vector &s, const std::vector &nums); +bool equal_to_existing_size(const std::vector &s, + const std::vector &nums); -std::vector> generate_serial_tilings(const std::vector &s, int d, +std::vector> generate_serial_tilings(const std::vector &s, + int d, int last_d, int vectorized_index, const std::vector &vec_dim_serial_sizes, @@ -27,13 +29,17 @@ std::vector> generate_serial_tilings(const std::vector> generate_tilings(const std::vector &s, int d, int factor, +std::vector> generate_tilings(const std::vector &s, + int d, + int factor, bool allow_splits, const std::vector &inner_sizes = std::vector()); /** moves vectorized dimension first and also removes dimensions with size 1 to reflect actual thread dimensions when loop nests are lowered **/ -void lowered_dims(const std::vector &size, int vector_loop_i, std::vector &lowered_size); +void lowered_dims(const std::vector &size, + int vector_loop_i, + std::vector &lowered_size); // creates tilings for gpu threads loops. // Innermost thread loop is always the vectorized dim and its extent is a multiple of 32. 
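The two helpers documented above are small enough to illustrate outside the autoscheduler. The sketch below is only an approximation of the documented behaviour, not the Halide implementation; the names lowered_dims_sketch and plausible_thread_tiling are hypothetical. It shows the vectorized extent being moved to the front with unit extents dropped, and a candidate thread tiling being kept only if its innermost (vectorized) extent is a multiple of 32 and at most three extents are greater than one, matching the "no more than three dimensions with ext > 1" rule noted in generate_gpu_tilings.

    // Illustrative sketch only: mirrors the documented behaviour of
    // lowered_dims() and the thread-tiling rules above; not the Halide code.
    #include <cstdint>
    #include <vector>

    // Move the vectorized extent first and drop extents equal to 1, so the
    // result reflects the thread dimensions that survive lowering.
    std::vector<int64_t> lowered_dims_sketch(const std::vector<int64_t> &size, int vector_loop_i) {
        std::vector<int64_t> lowered;
        if (vector_loop_i >= 0 && size[vector_loop_i] > 1) {
            lowered.push_back(size[vector_loop_i]);
        }
        for (int i = 0; i < (int)size.size(); ++i) {
            if (i != vector_loop_i && size[i] > 1) {
                lowered.push_back(size[i]);
            }
        }
        return lowered;
    }

    // Keep a candidate thread tiling only if the innermost (vectorized) extent
    // is a warp multiple and no more than three extents exceed 1.
    bool plausible_thread_tiling(const std::vector<int64_t> &lowered) {
        if (lowered.empty() || lowered[0] % 32 != 0) {
            return false;
        }
        return lowered.size() <= 3;
    }

For example, extents {1, 64, 4} with the vectorized loop at index 1 lower to {64, 4} and pass the check, while {48, 4} is rejected because 48 is not a multiple of 32.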
diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp index 8a13dc176b37..6dfeb0dc62b5 100644 --- a/src/autoschedulers/anderson2021/cost_model_generator.cpp +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -43,7 +43,8 @@ struct ModelWeight : public GeneratorInput> { GeneratorOutput> grad; ModelWeight(const std::string &name, int dim) - : GeneratorInput>(name, dim), grad("updated_" + name, dim + 1) { + : GeneratorInput>(name, dim), + grad("updated_" + name, dim + 1) { } void backprop(const Derivative &d, Expr learning_rate, const Expr ×tep) { std::vector args(dimensions() + 1); diff --git a/src/autoschedulers/anderson2021/retrain_cost_model.cpp b/src/autoschedulers/anderson2021/retrain_cost_model.cpp index 89ef78bebff0..bb0b6ece7245 100644 --- a/src/autoschedulers/anderson2021/retrain_cost_model.cpp +++ b/src/autoschedulers/anderson2021/retrain_cost_model.cpp @@ -266,10 +266,9 @@ size_t load_samples(map &training_set, map worst_inversion.badness) { worst_inversion.pipeline_id = p.pipeline_id;