diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 8ff7e3560799..8165979f90fb 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -25,7 +25,9 @@ value of HL_DEBUG_CODEGEN, if any). HL_PERMIT_FAILED_UNROLL - Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. + Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. + Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not + turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. #ifdef HALIDE_AUTOSCHEDULER_ALLOW_CYOS @@ -199,7 +201,15 @@ AutoSchedule::AutoSchedule(const FunctionDAG &dag, Statistics &stats, SearchSpace &search_space, const LoopNestParser *partial_schedule) - : dag{dag}, params{params}, target{target}, outputs{outputs}, rng{rng}, cost_model{cost_model}, stats{stats}, search_space{search_space}, partial_schedule{partial_schedule} { + : dag{dag}, + params{params}, + target{target}, + outputs{outputs}, + rng{rng}, + cost_model{cost_model}, + stats{stats}, + search_space{search_space}, + partial_schedule{partial_schedule} { configure_pipeline_features(dag, params, cost_model); } @@ -220,27 +230,26 @@ IntrusivePtr AutoSchedule::optimal_schedule_pass(int beam_size, int expanded = 0; - std::function &&)> enqueue_new_children = - [&](IntrusivePtr &&s) { - // aslog(1) << "\n** Generated child: "; - // s->dump(); - // s->calculate_cost(dag, params, nullptr, true); + std::function &&)> enqueue_new_children = [&](IntrusivePtr &&s) { + // aslog(1) << "\n** Generated child: "; + // s->dump(); + // s->calculate_cost(dag, params, nullptr, true); - // Each child should have one more decision made than its parent state. - internal_assert(s->num_decisions_made == s->parent->num_decisions_made + 1); + // Each child should have one more decision made than its parent state. 
+ internal_assert(s->num_decisions_made == s->parent->num_decisions_made + 1); - int progress = s->num_decisions_made * beam_size + expanded; - size_t max_progress = dag.nodes.size() * beam_size * 2; + int progress = s->num_decisions_made * beam_size + expanded; + size_t max_progress = dag.nodes.size() * beam_size * 2; - // Update the progress bar - tick.set(double(progress) / max_progress); - s->penalized = false; + // Update the progress bar + tick.set(double(progress) / max_progress); + s->penalized = false; - ++stats.num_states_added; + ++stats.num_states_added; - // Add the state to the list of states to evaluate - q.emplace(std::move(s)); - }; + // Add the state to the list of states to evaluate + q.emplace(std::move(s)); + }; std::unique_ptr target_loop_nest; @@ -600,7 +609,15 @@ void generate_schedule(const std::vector &outputs, std::mt19937 rng{(uint32_t)params.random_dropout_seed}; SearchSpace search_space{dag, params, target, rng, cost_model.get(), stats, partial_schedule.get()}; - AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model.get(), stats, search_space, partial_schedule.get()}; + AutoSchedule autoschedule{dag, + params, + target, + outputs, + rng, + cost_model.get(), + stats, + search_space, + partial_schedule.get()}; // Run beam search optimal = autoschedule.optimal_schedule(params.beam_size); @@ -656,7 +673,8 @@ void generate_schedule(const std::vector &outputs, aslog(1) << "Total cost model evaluation time (ms): " << stats.total_cost_model_evaluation_time() << "\n"; aslog(1) << "Average cost model evaluation time (ms): " << stats.average_cost_model_evaluation_time() << "\n"; std::chrono::duration total_time = timer.elapsed(); - aslog(1) << "Time taken for autoscheduler (s): " << std::chrono::duration_cast(total_time).count() / 1000.0 << '\n'; + aslog(1) << "Time taken for autoscheduler (s): " + << std::chrono::duration_cast(total_time).count() / 1000.0 << '\n'; } struct Anderson2021 { @@ -717,7 +735,15 @@ void find_and_apply_schedule(FunctionDAG &dag, } SearchSpace search_space{dag, params, target, rng, cost_model, stats, partial_schedule.get()}; - AutoSchedule autoschedule{dag, params, target, outputs, rng, cost_model, stats, search_space, partial_schedule.get()}; + AutoSchedule autoschedule{dag, + params, + target, + outputs, + rng, + cost_model, + stats, + search_space, + partial_schedule.get()}; IntrusivePtr optimal = autoschedule.optimal_schedule(beam_size); diff --git a/src/autoschedulers/anderson2021/DefaultCostModel.cpp b/src/autoschedulers/anderson2021/DefaultCostModel.cpp index 3eede5993d98..51bf21f21780 100644 --- a/src/autoschedulers/anderson2021/DefaultCostModel.cpp +++ b/src/autoschedulers/anderson2021/DefaultCostModel.cpp @@ -51,8 +51,7 @@ void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::Func const int pipeline_feat_size = head1_w * head1_h; // We ignore the first seven pipeline features in the cost // model. It's just a mask of which types are in use. 
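// Illustration, not part of the patch: the assertion that follows only pins down
// layout arithmetic. With hypothetical sizes, a struct laid out as a seven-int
// type mask followed by the per-stage feature counters satisfies the same relation:
//   struct ExamplePipelineFeatures { int type_mask[7]; int counters[56]; };
//   static_assert(sizeof(ExamplePipelineFeatures) - 7 * sizeof(int) == sizeof(int) * 56,
//                 "Incorrect size for pipeline features");
// Here 56 stands in for pipeline_feat_size (head1_w * head1_h); the real
// PipelineFeatures struct is checked the same way below.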
- static_assert(sizeof(PipelineFeatures) - 7 * sizeof(int) == - sizeof(int) * pipeline_feat_size, + static_assert(sizeof(PipelineFeatures) - 7 * sizeof(int) == sizeof(int) * pipeline_feat_size, "Incorrect size for pipeline features"); int num_stages = 0; for (const auto &n : dag.nodes) { @@ -231,15 +230,22 @@ float DefaultCostModel::backprop(const Runtime::Buffer &true_runtim batch_id, pipeline_feat_queue, schedule_feat_queue, - weights.head1_filter, weights.head1_bias, - weights.head2_filter, weights.head2_bias, - weights.conv1_filter, weights.conv1_bias, - learning_rate, timestep++, + weights.head1_filter, + weights.head1_bias, + weights.head2_filter, + weights.head2_bias, + weights.conv1_filter, + weights.conv1_bias, + learning_rate, + timestep++, fastest_idx, true_runtimes.alias(), - head1_filter_update, head1_bias_update, - head2_filter_update, head2_bias_update, - conv1_filter_update, conv1_bias_update, + head1_filter_update, + head1_bias_update, + head2_filter_update, + head2_bias_update, + conv1_filter_update, + conv1_bias_update, dst, dst_costs_per_stage, loss); diff --git a/src/autoschedulers/anderson2021/FunctionDAG.cpp b/src/autoschedulers/anderson2021/FunctionDAG.cpp index e4dae392d13d..1a057187dcbd 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.cpp +++ b/src/autoschedulers/anderson2021/FunctionDAG.cpp @@ -239,10 +239,10 @@ class Featurizer : public IRVisitor { void visit_memory_access(const std::string &name, Type t, const vector &args, PipelineFeatures::AccessType type) { // Compute matrix of partial derivatives of args w.r.t. loop params LoadJacobian matrix(args.size(), stage.loop.size(), 1); - vector ones_per_row(args.size(), 0), - zeros_per_row(args.size(), 0), - ones_per_col(stage.loop.size(), 0), - zeros_per_col(stage.loop.size(), 0); + vector ones_per_row(args.size(), 0); + vector zeros_per_row(args.size(), 0); + vector ones_per_col(stage.loop.size(), 0); + vector zeros_per_col(stage.loop.size(), 0); bool is_pointwise = args.size() == stage.loop.size(); for (size_t i = 0; i < args.size(); i++) { for (size_t j = 0; j < stage.loop.size(); j++) { @@ -295,7 +295,8 @@ class Featurizer : public IRVisitor { public: Featurizer(Function &func, FunctionDAG::Node::Stage &stage) - : func(func), stage(stage) { + : func(func), + stage(stage) { } void visit_store_args(const std::string &name, Type t, vector args) { diff --git a/src/autoschedulers/anderson2021/FunctionDAG.h b/src/autoschedulers/anderson2021/FunctionDAG.h index ef7e57651462..4e08a917f8ca 100644 --- a/src/autoschedulers/anderson2021/FunctionDAG.h +++ b/src/autoschedulers/anderson2021/FunctionDAG.h @@ -39,7 +39,8 @@ struct OptionalRational { OptionalRational() = default; OptionalRational(int64_t n, int64_t d) - : numerator(n), denominator(d) { + : numerator(n), + denominator(d) { } void operator+=(const OptionalRational &other) { @@ -137,7 +138,9 @@ class LoadJacobian { public: LoadJacobian(size_t producer_storage_dims, size_t consumer_loop_dims, int64_t count) - : c(count), rows(producer_storage_dims), cols(consumer_loop_dims) { + : c(count), + rows(producer_storage_dims), + cols(consumer_loop_dims) { coeffs.resize(rows * cols); } @@ -283,7 +286,9 @@ class Span { } Span(int64_t a, int64_t b, bool c) - : min_(a), max_(b), constant_extent_(c) { + : min_(a), + max_(b), + constant_extent_(c) { } Span() = default; Span(const Span &other) = default; diff --git a/src/autoschedulers/anderson2021/GPUMemInfo.h b/src/autoschedulers/anderson2021/GPUMemInfo.h index 7140a3be0ba1..40c000e1b8c1 100644 --- 
a/src/autoschedulers/anderson2021/GPUMemInfo.h +++ b/src/autoschedulers/anderson2021/GPUMemInfo.h @@ -175,7 +175,10 @@ struct Strides { struct GlobalAccessAccumulator { GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose) - : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} { + : bytes_per_access{bytes_per_access}, + dimensions{dimensions}, + strides{strides}, + verbose{verbose} { } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { @@ -257,7 +260,10 @@ struct GlobalAccessAccumulator { struct SharedAccessAccumulator { SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose) - : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} { + : bytes_per_access{bytes_per_access}, + dimensions{dimensions}, + strides{strides}, + verbose{verbose} { } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { @@ -348,7 +354,8 @@ struct SharedAccessAccumulator { struct LocalAccessAccumulator { LocalAccessAccumulator(int bytes_per_access, bool verbose) - : bytes_per_access{bytes_per_access}, verbose{verbose} { + : bytes_per_access{bytes_per_access}, + verbose{verbose} { } void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) { diff --git a/src/autoschedulers/anderson2021/LoopNest.cpp b/src/autoschedulers/anderson2021/LoopNest.cpp index be0209affb38..04850bafe633 100644 --- a/src/autoschedulers/anderson2021/LoopNest.cpp +++ b/src/autoschedulers/anderson2021/LoopNest.cpp @@ -176,7 +176,13 @@ bool LoopNest::add_gpu_thread_tilings(const FunctionDAG::Node *f, vector vectorized_indices; this->get_stage_sizes(f, stage_sizes, pure_dims, vectorized_indices); internal_assert(!stage_sizes.empty()); - auto tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, (int)(stage_sizes[0].size() - 1), vectorized_indices, true, false); + auto tilings = generate_gpu_tilings(stage_sizes, + pure_dims, + max_size, + (int)(stage_sizes[0].size() - 1), + vectorized_indices, + true, + false); bool made_child = false; for (const auto &t : tilings) { LoopNest *new_parent = new LoopNest; @@ -309,7 +315,9 @@ GPUMemoryType LoopNest::get_gpu_memory_type(bool in_block, bool in_thread, bool return GPUMemoryType::Global; } -std::vector LoopNest::unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const { +std::vector LoopNest::unrolled_loops(const Target &target, + const LoopNest *parent, + const LoopNest *grandparent) const { internal_assert(innermost); const auto &grandparent_bounds = grandparent->get_bounds(node); std::vector unrolled(parent->size.size(), 0); @@ -358,14 +366,14 @@ void LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target NodeMap &can_be_promoted_to_registers, const LoopNest *grandparent, const LoopNest *parent) const { - for (const auto *alloc_node : store_at) { const auto &store_site = sites.get(&alloc_node->stages[0]); if (store_site.gpu_store_memory_type != GPUMemoryType::Local) { continue; } - can_be_promoted_to_registers.get_or_create(alloc_node) = store_site.is_constant_allocation && store_site.allocation_size <= get_register_mem_alloc_limit(); + can_be_promoted_to_registers.get_or_create(alloc_node) = store_site.is_constant_allocation && + store_site.allocation_size <= get_register_mem_alloc_limit(); } for (const auto &c : children) { @@ -380,7 +388,8 @@ void 
LoopNest::get_allocs_that_can_be_promoted_to_registers(const Target &target continue; } - can_be_promoted_to_registers.get(e->producer) = can_be_promoted_to_registers.get(e->producer) && accessed_at_constant_indices(unrolled, e); + can_be_promoted_to_registers.get(e->producer) = can_be_promoted_to_registers.get(e->producer) && + accessed_at_constant_indices(unrolled, e); } } } @@ -578,7 +587,11 @@ int LoopNest::get_vectorized_loop_index_from_pure_stage(const LoopNest &root) co // Get the stride over "node's" storage for a unit increment in the vectorized loop's // index -double LoopNest::storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const { +double LoopNest::storage_stride(const LoadJacobian &jac, + int innermost_storage_dim, + const FunctionDAG::Node *storage_node, + const Bound &store_bounds, + const LoopNest &root) const { internal_assert(innermost_storage_dim >= 0); // The node's storage dimensions (from innermost outward) @@ -613,7 +626,10 @@ double LoopNest::storage_stride(const LoadJacobian &jac, int innermost_storage_d } // Shared mem accesses with stride 1 will likely be vectorized -bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const { +bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + int innermost_dim, + int loop_index) const { for (int i = 0; i < accessed->dimensions; i++) { auto stride = jac(i, loop_index); if (i == innermost_dim) { @@ -628,7 +644,12 @@ bool LoopNest::can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, c return true; } -bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const { +bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + bool accessed_has_been_scheduled, + int innermost_dim, + int loop_index, + const GPUMemoryType &mem_type) const { if (loop_index < 0 || mem_type != GPUMemoryType::Shared) { return false; } @@ -637,7 +658,12 @@ bool LoopNest::can_vectorize_store_access(const LoadJacobian &jac, const Functio return can_vectorize_access_for_innermost_dim(jac, accessed, innermost_dim, loop_index); } -int LoopNest::vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose) const { +int LoopNest::vectorized_load_access_size(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + bool accessed_has_been_scheduled, + int innermost_dim, + const GPUMemoryType &mem_type, + bool verbose) const { int vector_size = 1; if (mem_type != GPUMemoryType::Shared) { return vector_size; @@ -704,6 +730,7 @@ int LoopNest::vectorized_access_size(size_t loop_index, bool verbose) const { return 1; } + double LoopNest::compute_local_mem_stride(double stride, double bytes) const { // Each word is 4 bytes so adjust the stride based // on width of data being accessed @@ -718,11 +745,18 @@ double LoopNest::compute_local_mem_stride(double stride, double bytes) const { // Get the stride over "node's" storage and its element-wise stride for a unit // increment in the given thread loops -Strides LoopNest::compute_strides(const LoadJacobian &jac, int 
innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose) const { +Strides LoopNest::compute_strides(const LoadJacobian &jac, + int innermost_storage_dim, + const FunctionDAG::Node *storage_node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + bool verbose) const { internal_assert(innermost_storage_dim >= 0); if (verbose) { - aslog(2) << "\nstrides: " << node->func.name() << " (stage = " << stage->index << ") loading from " << storage_node->func.name() << " ->\n"; + aslog(2) << "\nstrides: " << node->func.name() << " (stage = " + << stage->index << ") loading from " + << storage_node->func.name() << " ->\n"; if (aslog::aslog_level() >= 2) { jac.dump(""); } @@ -811,7 +845,9 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage return strides; } -bool LoopNest::all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const { +bool LoopNest::all_strides_exist(const LoadJacobian &jac, + const FunctionDAG::Node *storage_node, + const LoopNest &root) const { int v = get_vectorized_loop_index_from_pure_stage(root); for (int i = 0; i < storage_node->dimensions; i++) { @@ -838,7 +874,20 @@ int LoopNest::get_actual_vector_dim(const Bound &store_bounds) const { return vector_dim; } -void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose) const { +void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const GPULoopInfo &gpu_loop_info, + const std::vector &inner_serial_loop_extents, + const Sites &consumer_site, + ScheduleFeatures &feat, + const LoopNest *parent, + const LoopNest &root, + GlobalMemInfo &global_mem_loads, + SharedMemInfo &shared_mem_loads, + LocalMemInfo &local_mem_loads, + bool verbose) const { if (consumer_site.is_stored_in_registers()) { return; } @@ -906,7 +955,9 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } feat.shared_mem_store_efficiency = shared_mem_info.efficiency(); - internal_assert(in_range_zero_one(feat.shared_mem_store_efficiency)) << "Invalid shared mem store efficiency: " << feat.shared_mem_store_efficiency << " for " << node->func.name(); + internal_assert(in_range_zero_one(feat.shared_mem_store_efficiency)) + << "Invalid shared mem store efficiency: " << feat.shared_mem_store_efficiency + << " for " << node->func.name(); } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::Global) { if (verbose) { @@ -928,7 +979,9 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } feat.global_mem_store_efficiency = global_mem_info.efficiency(); - internal_assert(in_range_zero_one(feat.global_mem_store_efficiency)) << "Invalid global mem store efficiency: " << feat.global_mem_store_efficiency << " for " << node->func.name(); + internal_assert(in_range_zero_one(feat.global_mem_store_efficiency)) + << "Invalid global mem store efficiency: " << feat.global_mem_store_efficiency + << " for " << node->func.name(); } else if 
(consumer_site.gpu_store_memory_type == GPUMemoryType::Local) { auto local_mem_info = compute_mem_store_info( @@ -945,7 +998,9 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } // feat.local_mem_store_efficiency = local_mem_info.efficiency(); - // internal_assert(in_range_zero_one(feat.local_mem_store_efficiency)) << "Invalid local mem store coalesce efficiency: " << feat.local_mem_store_efficiency << " for " << node->func.name(); + // internal_assert(in_range_zero_one(feat.local_mem_store_efficiency)) + // << "Invalid local mem store coalesce efficiency: " << feat.local_mem_store_efficiency + // << " for " << node->func.name(); } if (verbose) { @@ -959,7 +1014,11 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } else if (consumer_site.gpu_store_memory_type == GPUMemoryType::Local) { mem_type = "local"; } - aslog(2) << "END MEM ACCESS " << mem_type << "_mem_" << type << ". consumer: " << consumer_name << "_s" << stage->index << "; producer: " << consumer_name; + aslog(2) << "END MEM ACCESS " + << mem_type << "_mem_" << type + << ". consumer: " << consumer_name + << "_s" << stage->index + << "; producer: " << consumer_name; if (!jac.all_coeffs_exist()) { aslog(2) << " (not all coeffs exist)"; } @@ -968,7 +1027,14 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_ } template -void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const { +void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose) const { int bytes_per_access = node->bytes_per_point; // If the consumer is a scalar and is compute_root, then it will not be @@ -992,10 +1058,7 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const Accumulator accumulator(bytes_per_access, dimensions, strides, verbose); thread_info->for_each_thread_id_in_first_warp(accumulator); - accumulator.add_access_info( - num_requests, - mem_info, - false); + accumulator.add_access_info(num_requests, mem_info, false); if (verbose) { aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n"; @@ -1015,22 +1078,40 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const Accumulator accumulator(bytes_per_access, dimensions, strides, verbose); thread_info->for_each_thread_id_in_tail_warp(accumulator); - accumulator.add_access_info( - num_requests_per_warp, - mem_info, - true); + accumulator.add_access_info(num_requests_per_warp, mem_info, true); if (verbose) { aslog(2) << "END tail warp\n\n"; } } -template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; - -template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const; +template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian 
&jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose) const; + +template void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose) const; template<> -void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose) const { +void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose) const { int bytes_per_access = node->bytes_per_point; // If the consumer is a scalar and is compute_root, then it will not be @@ -1047,10 +1128,7 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian & LocalAccessAccumulator accumulator(bytes_per_access, verbose); thread_info->for_each_thread_id_in_first_warp(accumulator); - accumulator.add_access_info( - num_requests, - mem_info, - false); + accumulator.add_access_info(num_requests, mem_info, false); if (verbose) { aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n"; @@ -1070,17 +1148,20 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian & LocalAccessAccumulator accumulator(bytes_per_access, verbose); thread_info->for_each_thread_id_in_tail_warp(accumulator); - accumulator.add_access_info( - num_requests_per_warp, - mem_info, - true); + accumulator.add_access_info(num_requests_per_warp, mem_info, true); if (verbose) { aslog(2) << "END tail warp\n\n"; } } -std::pair LoopNest::compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const { +std::pair +LoopNest::compute_local_mem_store_features(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const LoopNest &root, + double serial_loop_extents) const { // Assume worst case serialized loads if the stride is unknown if (!all_strides_exist(jac, node, root)) { double stride = compute_local_mem_stride(32.0, node->bytes_per_point); @@ -1095,21 +1176,60 @@ std::pair LoopNest::compute_local_mem_store_features(const LoadJ } template -MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const { +MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const ThreadInfo *thread_info, + double serial_loop_extents, + bool verbose) const { MemInfoType mem_info; - compute_num_mem_accesses_per_block(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose); + compute_num_mem_accesses_per_block(jac, + node, + consumer_store_bounds, + thread_info, + consumer_innermost_dim, + 
serial_loop_extents, + mem_info, verbose); return mem_info; } -template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const; - -template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const; +template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const ThreadInfo *thread_info, + double serial_loop_extents, + bool verbose) const; + +template MemInfoType LoopNest::compute_mem_store_info(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const ThreadInfo *thread_info, + double serial_loop_extents, + bool verbose) const; template -void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { +void LoopNest::compute_mem_load_features(const LoadJacobian &jac, + int producer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &producer_store_bounds, + bool producer_has_been_scheduled, + const ThreadInfo *thread_info, + MemInfoType &mem_info, + double points_accessed_per_thread, + bool verbose) const { if (producer_has_been_scheduled) { - compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); + compute_num_mem_accesses_per_block(jac, + node, + producer_store_bounds, + thread_info, + producer_innermost_dim, + points_accessed_per_thread, + mem_info, + verbose); return; } @@ -1121,7 +1241,14 @@ void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_i for (int i = 0; i < node->dimensions; i++) { MemInfoType info; - compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, i, points_accessed_per_thread, info, verbose); + compute_num_mem_accesses_per_block(jac, + node, + producer_store_bounds, + thread_info, + i, + points_accessed_per_thread, + info, + verbose); if (i == 0 || info.num_transactions() < min_required_accesses) { min_info = info; min_required_accesses = info.num_transactions(); @@ -1161,7 +1288,14 @@ void LoopNest::compute_mem_load_features(const LoadJacobian &jac, MemInfoType &mem_info, double points_accessed_per_thread, bool verbose) const { - compute_num_mem_accesses_per_block(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose); + compute_num_mem_accesses_per_block(jac, + node, + producer_store_bounds, + thread_info, + producer_innermost_dim, + points_accessed_per_thread, + mem_info, + verbose); } // Assumes block, serial, thread or block, thread nesting @@ -1176,7 +1310,8 @@ const LoopNest *LoopNest::get_enclosing_block(const LoopNest *parent, const Loop return grandparent; } - internal_error << "Invalid nesting: " << stringify(parent->gpu_label) << ", " << stringify(grandparent->gpu_label) << "\n"; + internal_error << "Invalid nesting: " << 
stringify(parent->gpu_label) << ", " << stringify(grandparent->gpu_label) + << "\n"; return nullptr; } @@ -1252,12 +1387,16 @@ void LoopNest::compute_warp_features(ScheduleFeatures &features, const GPULoopIn features.block_occupancy = thread_info->block_occupancy(); features.num_threads_per_block = thread_info->num_threads; - internal_assert(in_range_zero_one(features.block_occupancy)) << "Invalid block occupancy: " << features.block_occupancy; - internal_assert(in_range_zero_one(features.warp_lane_utilization)) << "Invalid warp utilization: " << features.warp_lane_utilization; + internal_assert(in_range_zero_one(features.block_occupancy)) + << "Invalid block occupancy: " << features.block_occupancy; + internal_assert(in_range_zero_one(features.warp_lane_utilization)) + << "Invalid warp utilization: " << features.warp_lane_utilization; } // Assume that when a block is active, all its warps are active -void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const { +void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, + ScheduleFeatures &feat, + const GPULoopInfo &gpu_loop_info) const { // Only compute these features for stage's that actually have a block // loop if (node != gpu_loop_info.current_block_loop->node) { @@ -1282,7 +1421,10 @@ void LoopNest::compute_warp_and_block_occupancy(const Anderson2021Params ¶ms feat.max_block_occupancy = (double)max_active_blocks / (double)active_block_hardware_limit; } -void LoopNest::compute_shared_mem_occupancy(const Anderson2021Params ¶ms, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const { +void LoopNest::compute_shared_mem_occupancy(const Anderson2021Params ¶ms, + const Target &target, + int64_t total_shared_mem_alloc_size, + ScheduleFeatures &feat) const { if (!is_gpu_block(target)) { return; } @@ -1295,10 +1437,12 @@ void LoopNest::compute_shared_mem_occupancy(const Anderson2021Params ¶ms, co internal_assert(feat.shared_mem_occupancy <= 1) << "Invalid shared mem occupancy: " << feat.shared_mem_occupancy; if (total_shared_mem_alloc_size > 0) { - auto shared_mem_max_active_blocks = std::min(active_block_hardware_limit, shared_mem_sm_limit / total_shared_mem_alloc_size); + auto shared_mem_max_active_blocks = std::min(active_block_hardware_limit, + shared_mem_sm_limit / total_shared_mem_alloc_size); feat.shared_mem_block_limit_factor = (double)shared_mem_max_active_blocks / (double)active_block_hardware_limit; - internal_assert(feat.shared_mem_block_limit_factor <= 1) << "Invalid shared mem block limit factor: " << feat.shared_mem_block_limit_factor; + internal_assert(feat.shared_mem_block_limit_factor <= 1) + << "Invalid shared mem block limit factor: " << feat.shared_mem_block_limit_factor; } } @@ -1329,20 +1473,21 @@ std::pair LoopNest::find_innermost_and_paren return {child, parent}; } -int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, - const Target &target, - const GPULoopInfo &gpu_loop_info, - const std::vector &edge_chain, - const LoadJacobian &jac, - const LoopNest *parent, - const LoopNest *grandparent, - int64_t n, - const ScheduleFeatures &feat, - const LoadJacobian &serial_jac, - bool producer_has_been_scheduled, - int producer_innermost_dim, - const GPUMemoryType &mem_type, - bool verbose) const { +int64_t LoopNest::points_accessed_per_thread( + const Anderson2021Params ¶ms, + const Target &target, + const GPULoopInfo &gpu_loop_info, + const std::vector &edge_chain, + 
const LoadJacobian &jac, + const LoopNest *parent, + const LoopNest *grandparent, + int64_t n, + const ScheduleFeatures &feat, + const LoadJacobian &serial_jac, + bool producer_has_been_scheduled, + int producer_innermost_dim, + const GPUMemoryType &mem_type, + bool verbose) const { std::unique_ptr innermost_parent_clone = std::make_unique(); innermost_parent_clone->copy_from(*parent); @@ -1406,7 +1551,15 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, } } - IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles(tiling, grandparent, params, target, true, false, false, rvars_to_move_inward); + IntrusivePtr innermost_parent = innermost_parent_clone->parallelize_in_tiles( + tiling, + grandparent, + params, + target, + true, + false, + false, + rvars_to_move_inward); const auto &bounds = innermost_parent->get_bounds_along_edge_chain(producer, edge_chain); int64_t num_points = 1; @@ -1424,7 +1577,8 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, } // There are 2 ways to calculate the number of points accessed: - // 1. The region_required of the producer in the non-LICM unrolled loops * the loop extents of the non-LICM loops that cannot be unrolled + // 1. The region_required of the producer in the non-LICM unrolled loops * the loop extents of the non-LICM loops + // that cannot be unrolled int64_t points_accessed_by_region_required = num_points * product_of_non_licm_non_unrolled_extents; // 2. The number of points computed according to 'n' (the number of @@ -1443,13 +1597,12 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, points_accessed = points_accessed_by_loop_extents; if (mem_type == GPUMemoryType::Shared) { - int vector_size = parent->vectorized_load_access_size( - serial_jac, - producer, - producer_has_been_scheduled, - producer_innermost_dim, - mem_type, - verbose); + int vector_size = parent->vectorized_load_access_size(serial_jac, + producer, + producer_has_been_scheduled, + producer_innermost_dim, + mem_type, + verbose); if (verbose) { aslog(2) << "\n"; @@ -1467,7 +1620,8 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, points_accessed *= gpu_loop_info.total_outer_serial_extents; - int64_t total_inner_serial_extents_outside_realization = gpu_loop_info.get_total_inner_serial_extents_outside_realization(this); + int64_t total_inner_serial_extents_outside_realization = + gpu_loop_info.get_total_inner_serial_extents_outside_realization(this); // If you have a realization inside a serial loop e.g. // f 80 gpu_block @@ -1497,7 +1651,11 @@ int64_t LoopNest::points_accessed_per_thread(const Anderson2021Params ¶ms, return points_accessed; } -int64_t LoopNest::compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const { +int64_t LoopNest::compute_licm_amortization(const LoopNest *innermost, + const LoopNest *parent, + const ScheduleFeatures &feat, + const LoadJacobian &jac, + int producer_dims) const { // Is this load loop-invariant over an // unrolled block? If so, we amortize the // number of loads to account for LICM. 
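A short standalone sketch of the amortization idea described in the comment above (an illustration under assumptions, not the patch's implementation; the UnrolledLoop bookkeeping and the licm_amortization name are hypothetical): if a load's index does not depend on an unrolled inner loop, LICM hoists it out of that loop, so the naive load count can be divided by that loop's extent.

#include <cstdint>
#include <vector>

struct UnrolledLoop {
    int64_t extent;          // trip count of the unrolled loop
    bool load_is_invariant;  // the load's index has no dependence on this loop's variable
};

// Product of the extents of all unrolled loops the load is invariant over;
// dividing the naive load count by this factor models the effect of LICM.
int64_t licm_amortization(const std::vector<UnrolledLoop> &loops) {
    int64_t amortization = 1;
    for (const auto &l : loops) {
        if (l.load_is_invariant) {
            amortization *= l.extent;  // hoisted: one load serves every iteration
        }
    }
    return amortization;
}

For example, a load that is invariant over two unrolled loops of extents 4 and 8 would be charged 1/32 of the naive count, which is the effect the feature computation is trying to capture.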
@@ -1526,7 +1684,8 @@ int64_t LoopNest::compute_licm_amortization(const LoopNest *innermost, const Loo return amortization; } -void LoopNest::memoize_points_computed_minimum(StageMap &memoized_features, const StageMap *features) const { +void LoopNest::memoize_points_computed_minimum(StageMap &memoized_features, + const StageMap *features) const { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); const auto &inlined_feat = features->get(&(f->stages[0])); @@ -1565,9 +1724,7 @@ vector> LoopNest::collect_producers(const StageMap &sites) done.insert(e->producer); const auto &site = sites.get(&(e->producer->stages[0])); if (site.store->is_root()) { - int vector_dim = (e->producer->is_input ? 0 : - site.produce != nullptr ? site.produce->vector_dim : - -1); + int vector_dim = (e->producer->is_input ? 0 : (site.produce != nullptr ? site.produce->vector_dim : -1)); producers.emplace_back(e->producer->id, vector_dim); } else if (site.produce != nullptr) { // Computation must be nested inside this task or inlined into it. @@ -1586,9 +1743,8 @@ uint64_t LoopNest::compute_hash_of_producers_stored_at_root(const StageMap> producers = collect_producers(sites); // Sort them according to node id - std::sort(producers.begin(), producers.end(), [](const pair &a, const pair &b) { - return a.first < b.first; - }); + std::sort(producers.begin(), producers.end(), + [](const pair &a, const pair &b) { return a.first < b.first; }); uint64_t store_root_hash = 0; for (const auto &p : producers) { @@ -1607,7 +1763,8 @@ void LoopNest::collect_stages(std::set &stages } } -void LoopNest::memoize_features(StageMap &memoized_features, const StageMap *features) const { +void LoopNest::memoize_features(StageMap &memoized_features, + const StageMap *features) const { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); if (memoized_features.contains(&(f->stages[0]))) { @@ -1643,7 +1800,8 @@ void LoopNest::compute_working_set_from_features(int64_t *working_set, *working_set += working_set_here; } -void LoopNest::recompute_inlined_features(const StageMap &sites, StageMap *features) const { +void LoopNest::recompute_inlined_features(const StageMap &sites, + StageMap *features) const { for (const auto &c : children) { c->recompute_inlined_features(sites, features); } @@ -1665,9 +1823,8 @@ void LoopNest::recompute_inlined_features(const StageMap &sites, StageMap inlined_feat.inlined_calls += intermediate.inlined_calls; inlined_feat.num_scalars += intermediate.num_scalars; if (inlined_feat.innermost_pure_loop_extent > 0) { - inlined_feat.innermost_pure_loop_extent = - std::min(inlined_feat.innermost_pure_loop_extent, - intermediate.innermost_pure_loop_extent); + inlined_feat.innermost_pure_loop_extent = std::min(inlined_feat.innermost_pure_loop_extent, + intermediate.innermost_pure_loop_extent); } else { inlined_feat.innermost_pure_loop_extent = intermediate.innermost_pure_loop_extent; } @@ -1730,8 +1887,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, size_t i = size[idx]; loop_instances *= i; if (stage->loop[idx].pure && !in_impure) { - if (params.parallelism > 1 && - (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) { + if (params.parallelism > 1 && (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) { // Either we've picked our parallel tiling, or // it's not yet determined. 
Assume we'll not split // any loops and just stop after we hit the @@ -1843,7 +1999,25 @@ void LoopNest::compute_features(const FunctionDAG &dag, ++stats.num_memoization_misses; } - c->compute_features(dag, params, target, sites, subinstances, parallelism, this, parent, root, gpu_loop_info, use_memoized_features, total_shared_mem_alloc_sizes, &working_set_here, &working_set_here_local_constant, &working_set_here_local_dynamic, features, stats, verbose); + c->compute_features(dag, + params, + target, + sites, + subinstances, + parallelism, + this, + parent, + root, + gpu_loop_info, + use_memoized_features, + total_shared_mem_alloc_sizes, + &working_set_here, + &working_set_here_local_constant, + &working_set_here_local_dynamic, + features, + stats, + verbose); + if (use_memoized_features) { c->features[hash_of_producers].make_large(dag.nodes[0].stages[0].max_id); c->memoize_features(c->features[hash_of_producers], features); @@ -1906,7 +2080,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, for (auto *e : node->outgoing_edges) { points_computed_minimum_if_inlined += features->get(e->consumer).points_computed_minimum * e->calls; } - feat.points_computed_minimum = std::min(feat.points_computed_minimum, (double)points_computed_minimum_if_inlined); + feat.points_computed_minimum = std::min(feat.points_computed_minimum, + (double)points_computed_minimum_if_inlined); } // When memoizing, we need to recompute features for inlined Funcs @@ -2142,9 +2317,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, } if (innermost) { - bool parent_unrolled = - (feat.innermost_pure_loop_extent <= get_unroll_limit(target) && - parent->node == node); + bool parent_unrolled = (feat.innermost_pure_loop_extent <= get_unroll_limit(target) && parent->node == node); if (parent_unrolled) { parent_unrolled = all(unrolled_loops(target, parent, grandparent)); @@ -2171,7 +2344,6 @@ void LoopNest::compute_features(const FunctionDAG &dag, int64_t global_lines_loaded = 0, shared_lines_loaded = 0, local_lines_loaded = 0, register_lines_loaded = 0; int64_t global_bytes_loaded_per_thread = 0, shared_bytes_loaded_per_thread = 0, register_bytes_loaded_per_thread = 0; int64_t global_lines_loaded_per_thread = 0, shared_lines_loaded_per_thread = 0, register_lines_loaded_per_thread = 0; - ; int64_t global_allocation_bytes_loaded = 0, shared_allocation_bytes_loaded = 0; GlobalMemInfo global_mem_loads; SharedMemInfo shared_mem_loads; @@ -2193,21 +2365,20 @@ void LoopNest::compute_features(const FunctionDAG &dag, inner_serial_loop_extents_computed = true; auto store_jac = *stage->store_jacobian; - compute_gpu_store_features( - store_jac, - vector_dim, - stage->node, - bounds, - gpu_loop_info, - inner_serial_loop_extents, - consumer_site, - feat, - parent, - root, - global_mem_loads, - shared_mem_loads, - local_mem_loads, - verbose); + compute_gpu_store_features(store_jac, + vector_dim, + stage->node, + bounds, + gpu_loop_info, + inner_serial_loop_extents, + consumer_site, + feat, + parent, + root, + global_mem_loads, + shared_mem_loads, + local_mem_loads, + verbose); } // The parallel loop of the consumer @@ -2352,24 +2523,41 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS shared_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; + aslog(2) << "BEGIN MEM ACCESS shared_mem_load. 
" + << "consumer: " << consumer_name + << "_s" << stage->index + << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Shared, verbose); + int64_t points_accessed = points_accessed_per_thread(params, + target, + gpu_loop_info, + edge_chain, + jac.first, + parent, + grandparent, + n, + feat, + serial_jac.first, + producer_has_been_scheduled, + producer_innermost_dim, + GPUMemoryType::Shared, + verbose); + + compute_mem_load_features(jac.first, + producer_innermost_dim, + e->producer, + producer_store_bounds, + producer_has_been_scheduled, + gpu_loop_info.get_thread_info(), + shared_mem_loads, + points_accessed, + verbose); - compute_mem_load_features( - jac.first, - producer_innermost_dim, - e->producer, - producer_store_bounds, - producer_has_been_scheduled, - gpu_loop_info.get_thread_info(), - shared_mem_loads, - points_accessed, - verbose); if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; - aslog(2) << "END MEM ACCESS shared_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + aslog(2) << "END MEM ACCESS shared_mem_load. consumer: " << node->func.name() + << "; producer: " << e->producer->func.name(); if (!jac.first.all_coeffs_exist()) { aslog(1) << " (not all coeffs exist)"; } @@ -2383,25 +2571,39 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS global_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; + aslog(2) << "BEGIN MEM ACCESS global_mem_load. consumer: " << consumer_name << "_s" + << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, serial_jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Global, verbose); - - compute_mem_load_features( - jac.first, - producer_innermost_dim, - e->producer, - producer_store_bounds, - producer_has_been_scheduled, - gpu_loop_info.get_thread_info(), - global_mem_loads, - points_accessed, - verbose); + int64_t points_accessed = points_accessed_per_thread(params, + target, + gpu_loop_info, + edge_chain, + jac.first, + parent, + grandparent, + n, + feat, + serial_jac.first, + producer_has_been_scheduled, + producer_innermost_dim, + GPUMemoryType::Global, + verbose); + + compute_mem_load_features(jac.first, + producer_innermost_dim, + e->producer, + producer_store_bounds, + producer_has_been_scheduled, + gpu_loop_info.get_thread_info(), + global_mem_loads, + points_accessed, + verbose); if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; - aslog(2) << "END MEM ACCESS global_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + aslog(2) << "END MEM ACCESS global_mem_load. 
consumer: " << node->func.name() + << "; producer: " << e->producer->func.name(); if (!jac.first.all_coeffs_exist()) { aslog(2) << " (not all coeffs exist)"; } @@ -2423,10 +2625,24 @@ void LoopNest::compute_features(const FunctionDAG &dag, sanitize_names(consumer_name); std::string producer_name = e->producer->func.name(); sanitize_names(producer_name); - aslog(2) << "BEGIN MEM ACCESS local_mem_load. consumer: " << consumer_name << "_s" << stage->index << "; producer: " << producer_name << "\n"; + aslog(2) << "BEGIN MEM ACCESS local_mem_load. consumer: " << consumer_name << "_s" + << stage->index << "; producer: " << producer_name << "\n"; } - int64_t points_accessed = points_accessed_per_thread(params, target, gpu_loop_info, edge_chain, jac.first, parent, grandparent, n, feat, jac.first, producer_has_been_scheduled, producer_innermost_dim, GPUMemoryType::Local, verbose); + int64_t points_accessed = points_accessed_per_thread(params, + target, + gpu_loop_info, + edge_chain, + jac.first, + parent, + grandparent, + n, + feat, + jac.first, + producer_has_been_scheduled, + producer_innermost_dim, + GPUMemoryType::Local, + verbose); compute_mem_load_features( jac.first, @@ -2441,7 +2657,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (verbose) { aslog(2) << "num_blocks = " << gpu_loop_info.num_blocks << "\n"; - aslog(2) << "END MEM ACCESS local_mem_load. consumer: " << node->func.name() << "; producer: " << e->producer->func.name(); + aslog(2) << "END MEM ACCESS local_mem_load. consumer: " << node->func.name() + << "; producer: " << e->producer->func.name(); if (!jac.first.all_coeffs_exist()) { aslog(2) << " (not all coeffs exist)"; } @@ -2460,7 +2677,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, // Now look at the shapes of the regions read from // the producer at various sites. - int64_t max_extent = 1, max_thread_extent = 1, max_compute_extent = 1, max_store_extent = 1, max_task_extent = 1; + int64_t max_extent = 1, max_thread_extent = 1, max_compute_extent = 1, max_store_extent = 1, + max_task_extent = 1; for (int i = 0; i < e->producer->dimensions; i++) { auto p = bounds->region_required(i); auto compute_p = producer_compute_bounds->region_computed(i); @@ -2469,7 +2687,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, // Check some invariants internal_assert(store_p.min() <= store_p.max()) << store_p.min() << " " << store_p.max() << "\n"; - internal_assert(compute_p.min() <= compute_p.max()) << compute_p.min() << " " << compute_p.max() << "\n"; + internal_assert(compute_p.min() <= compute_p.max()) + << compute_p.min() << " " << compute_p.max() << "\n"; internal_assert(task_p.min() <= task_p.max()) << task_p.min() << " " << task_p.max() << "\n"; int64_t thread_extent = 1; @@ -2521,7 +2740,9 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (!e->producer->is_input) { const int64_t producer_store_instances = - producer_has_been_scheduled ? features->get_or_create(&(e->producer->stages[0])).num_realizations : site.num_realizations; + producer_has_been_scheduled ? 
+ features->get_or_create(&(e->producer->stages[0])).num_realizations : + site.num_realizations; internal_assert(producer_store_instances > 0); @@ -2621,7 +2842,8 @@ void LoopNest::compute_features(const FunctionDAG &dag, internal_assert(global_bytes_loaded >= 0) << "Negative global bytes loaded: " << global_bytes_loaded << "\n"; internal_assert(shared_bytes_loaded >= 0) << "Negative shared bytes loaded: " << shared_bytes_loaded << "\n"; internal_assert(local_bytes_loaded >= 0) << "Negative local bytes loaded: " << local_bytes_loaded << "\n"; - internal_assert(register_bytes_loaded >= 0) << "Negative register bytes loaded: " << register_bytes_loaded << "\n"; + internal_assert(register_bytes_loaded >= 0) + << "Negative register bytes loaded: " << register_bytes_loaded << "\n"; feat.global_allocation_bytes_read_per_realization = global_allocation_bytes_loaded; feat.shared_allocation_bytes_read_per_realization = shared_allocation_bytes_loaded; @@ -2637,24 +2859,29 @@ void LoopNest::compute_features(const FunctionDAG &dag, if (!at_pure_production) { // Also pessimistically assume this update definition relies on the entirety of the produced region so far. // TODO: This overbills scatters, or writes to a sub-window. - internal_assert(feat.bytes_at_production >= 0) << "Negative bytes at production: " << feat.bytes_at_production << "\n"; + internal_assert(feat.bytes_at_production >= 0) + << "Negative bytes at production: " << feat.bytes_at_production << "\n"; const auto &consumer_site = sites.get(&node->stages[0]); if (consumer_site.is_stored_in_global_mem()) { feat.unique_global_bytes_read_per_realization += feat.bytes_at_production; - feat.unique_global_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.unique_global_lines_read_per_realization += + feat.bytes_at_production / feat.innermost_bytes_at_production; feat.global_allocation_bytes_read_per_realization += feat.bytes_at_production; } else if (consumer_site.is_stored_in_shared_mem()) { feat.unique_shared_bytes_read_per_realization += feat.bytes_at_production; - feat.unique_shared_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.unique_shared_lines_read_per_realization += + feat.bytes_at_production / feat.innermost_bytes_at_production; feat.shared_allocation_bytes_read_per_realization += feat.bytes_at_production; } else if (consumer_site.is_stored_in_local_mem()) { // feat.unique_local_bytes_read_per_realization += feat.bytes_at_production; - // feat.unique_local_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; - // feat.local_allocation_bytes_read_per_realization += feat.bytes_at_production; + // feat.unique_local_lines_read_per_realization += feat.bytes_at_production / + // feat.innermost_bytes_at_production; feat.local_allocation_bytes_read_per_realization += + // feat.bytes_at_production; } else if (consumer_site.is_stored_in_registers()) { feat.unique_register_bytes_read_per_realization += feat.bytes_at_production; - feat.unique_register_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production; + feat.unique_register_lines_read_per_realization += + feat.bytes_at_production / feat.innermost_bytes_at_production; feat.register_allocation_bytes_read_per_realization += feat.bytes_at_production; } else { internal_assert(false); @@ -2675,8 +2902,10 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.points_computed_per_production = subinstances / 
feat.num_productions; - feat.unique_bytes_read_per_point = global_bytes_loaded + shared_bytes_loaded + local_bytes_loaded + register_bytes_loaded; - feat.unique_lines_read_per_point = global_lines_loaded + shared_lines_loaded + local_lines_loaded + register_bytes_loaded; + feat.unique_bytes_read_per_point = + global_bytes_loaded + shared_bytes_loaded + local_bytes_loaded + register_bytes_loaded; + feat.unique_lines_read_per_point = + global_lines_loaded + shared_lines_loaded + local_lines_loaded + register_bytes_loaded; feat.num_global_mem_loads_per_block = global_mem_loads.num_transactions(); feat.global_mem_load_efficiency = global_mem_loads.efficiency(); @@ -2684,9 +2913,11 @@ void LoopNest::compute_features(const FunctionDAG &dag, feat.num_shared_mem_loads_per_block = shared_mem_loads.num_transactions(); feat.shared_mem_load_efficiency = shared_mem_loads.efficiency(); - internal_assert(in_range_zero_one(feat.global_mem_load_efficiency)) << "Invalid global mem load efficiency: " << feat.global_mem_load_efficiency; + internal_assert(in_range_zero_one(feat.global_mem_load_efficiency)) + << "Invalid global mem load efficiency: " << feat.global_mem_load_efficiency; - internal_assert(in_range_zero_one(feat.shared_mem_load_efficiency)) << "Invalid shared mem load efficiency: " << feat.shared_mem_load_efficiency; + internal_assert(in_range_zero_one(feat.shared_mem_load_efficiency)) + << "Invalid shared mem load efficiency: " << feat.shared_mem_load_efficiency; } // Track features for inlined Funcs @@ -2698,8 +2929,7 @@ void LoopNest::compute_features(const FunctionDAG &dag, inlined_feat.num_scalars += it.value() * feat.num_scalars; if (inlined_feat.innermost_pure_loop_extent > 0) { inlined_feat.innermost_pure_loop_extent = - std::min(inlined_feat.innermost_pure_loop_extent, - feat.innermost_pure_loop_extent); + std::min(inlined_feat.innermost_pure_loop_extent, feat.innermost_pure_loop_extent); } else { inlined_feat.innermost_pure_loop_extent = feat.innermost_pure_loop_extent; } @@ -2764,16 +2994,19 @@ void LoopNest::compute_features(const FunctionDAG &dag, // required of 'g' should be 1 point for each point of 'out' but get_bounds() // will also include the edge 'g' -> 'f' and give the result 201 points for every point // of 'out') -Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const { +Bound LoopNest::get_bounds_along_edge_chain(const FunctionDAG::Node *f, + const vector &edge_chain) const { internal_assert(!edge_chain.empty()); internal_assert(edge_chain[0]->consumer == stage) - << "get_bounds_along_edge_chain must be called with an edge chain that begins from the current loop nest's node. But the given edge chain begins with " << edge_chain[0]->consumer->node->func.name() - << " not " << node->func.name(); + << "get_bounds_along_edge_chain must be called with an edge chain that begins from the current loop nest's " + "node. But the given edge chain begins with " + << edge_chain[0]->consumer->node->func.name() << " not " << node->func.name(); internal_assert(edge_chain.back()->producer == f) - << "get_bounds_along_edge_chain must be called with an edge chain that ends with the given node. But the given edge chain ends with " << edge_chain.back()->producer->func.name() - << " not " << f->func.name(); + << "get_bounds_along_edge_chain must be called with an edge chain that ends with the given node. 
But the given " + "edge chain ends with " + << edge_chain.back()->producer->func.name() << " not " << f->func.name(); vector bounds; BoundContents *bound; @@ -2841,9 +3074,8 @@ const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const { bound->region_required(i) = f->estimated_region_required[i]; } } else { - internal_assert(!f->outgoing_edges.empty()) - << "No consumers of " << f->func.name() - << " at loop over " << (is_root() ? "root" : node->func.name()) << "\n"; + internal_assert(!f->outgoing_edges.empty()) << "No consumers of " << f->func.name() << " at loop over " + << (is_root() ? "root" : node->func.name()) << "\n"; auto init = Span::empty_span(); for (int i = 0; i < f->dimensions; i++) { bound->region_required(i) = init; @@ -2851,9 +3083,7 @@ const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const { for (const auto *e : f->outgoing_edges) { // Ignore consumers outside of this loop nest - if (!is_root() && - (stage != e->consumer) && - (!stage->downstream_of(*(e->consumer->node)))) { + if (!is_root() && (stage != e->consumer) && (!stage->downstream_of(*(e->consumer->node)))) { continue; } const auto &c_bounds = get_bounds(e->consumer->node); @@ -3165,9 +3395,7 @@ bool LoopNest::compute_here(const FunctionDAG::Node *f, internal_assert(l.max() >= l.min()) << i << " " << l.max() << " " << l.min() << "\n"; - if (f->dimensions && - node->size[i] >= 1 && - f->stages[s].loop[i].var == f->func.args()[v]) { + if (f->dimensions && node->size[i] >= 1 && f->stages[s].loop[i].var == f->func.args()[v]) { node->vectorized_loop_index = (int)i; vector_size = (int64_t)(node->stage->vector_size); single_point->loops(s, i).set_extent(vector_size); @@ -3239,7 +3467,6 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const vector &rvars_to_move_inward) const { - // Split this loop and move factors to the inner loop LoopNest *inner = new LoopNest, *outer = new LoopNest; inner->node = outer->node = node; @@ -3476,11 +3703,7 @@ vector> LoopNest::compute_in_tiles(const FunctionDA can_compute_here = can_compute_here || (in_threads_loop && search_space_options.compute_at_thread()); // Place the computation directly inside this loop (provided it's not a SIMD loop) - if (!innermost && - (!in_realization || - size.empty() || - vector_dim == -1 || - size[vector_dim] == 1) && + if (!innermost && (!in_realization || size.empty() || vector_dim == -1 || size[vector_dim] == 1) && can_compute_here) { std::unique_ptr r{new LoopNest}; @@ -3527,7 +3750,16 @@ vector> LoopNest::compute_in_tiles(const FunctionDA in_threads_loop |= (children[child]->gpu_label == GPU_parallelism::Thread); // we must pass down union thread count constraints computed at block level when computing further in - auto opts = children[child]->compute_in_tiles(f, this, params, target, search_space_options, v, store_here, in_threads_loop, false, union_counts); + auto opts = children[child]->compute_in_tiles(f, + this, + params, + target, + search_space_options, + v, + store_here, + in_threads_loop, + false, + union_counts); for (IntrusivePtr &n : opts) { // (Only valid if one child calls f) Push the // computation into the child. 
Possibly leaving @@ -3643,7 +3875,8 @@ bool LoopNest::producer_computed_here_or_further_in(const FunctionDAG::Node *pro return false; } -void LoopNest::get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest) const { +void LoopNest::get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, + const LoopNest *compute_root_loop_nest) const { if (is_root()) { for (const auto &c : children) { descendants.emplace(c->stage, {}); @@ -3719,8 +3952,7 @@ void LoopNest::apply(LoopLevel here, state->vars.push_back(fv); } // Bubble the innermost pure dimension to the front of the pure dimensions - for (int i = vectorized_loop_index - 1; - i >= 0 && state->vars[i].pure; i--) { + for (int i = vectorized_loop_index - 1; i >= 0 && state->vars[i].pure; i--) { std::swap(state->vars[i], state->vars[i + 1]); } state_map.emplace(stage, std::unique_ptr(state)); @@ -3792,8 +4024,7 @@ void LoopNest::apply(LoopLevel here, // stage's types and will often be 1, in which case we // don't want to vectorize the loop if (!target.has_gpu_feature() || stage->vector_size > 1) { - state.schedule_source - << "\n .vectorize(" << v.var.name() << ")"; + state.schedule_source << "\n .vectorize(" << v.var.name() << ")"; s.vectorize(v.var); v.vectorized = true; state.vectorized = true; @@ -3849,7 +4080,8 @@ void LoopNest::apply(LoopLevel here, } auto tail_strategy = pure_var_tail_strategy; - // If it's an RVar, or not the outermost split and we're in an update, we need a guard with if instead. + // If it's an RVar, or not the outermost split and we're in an update, we need a guard with if + // instead. // If the factor evenly divides the parent extent, then // no tail strategy is needed @@ -3863,13 +4095,9 @@ void LoopNest::apply(LoopLevel here, } s.split(parent.var, parent.var, inner, (int)factor, tail_strategy); - state.schedule_source - << "\n .split(" - << parent.var.name() << ", " - << parent.var.name() << ", " - << inner.name() << ", " - << factor << ", " - << "TailStrategy::" << tail_strategy << ")"; + state.schedule_source << "\n .split(" << parent.var.name() << ", " << parent.var.name() + << ", " << inner.name() << ", " << factor << ", " + << "TailStrategy::" << tail_strategy << ")"; v = parent; parent.extent = size[parent.index]; v.constant_extent = (!parent.var.is_rvar && parent.exists); @@ -3927,7 +4155,8 @@ void LoopNest::apply(LoopLevel here, if (!found) { here = LoopLevel(node->func, Var::outermost()); } - // internal_assert(found) << "Could not find appropriate compute_at location for children of " << node->func.name() << "\n"; + // internal_assert(found) << "Could not find appropriate compute_at location for children of " << + // node->func.name() << "\n"; state.vars.insert(state.vars.begin(), new_inner.begin(), new_inner.end()); } } @@ -3984,7 +4213,8 @@ void LoopNest::apply(LoopLevel here, } } -void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const NodeMap &all_inlined) const { +void LoopNest::update_producers_to_be_staged(StageScheduleState &state, + const NodeMap &all_inlined) const { std::vector>> pending; std::vector edge_chain; pending.emplace_back(stage, edge_chain); @@ -4017,7 +4247,8 @@ void LoopNest::update_producers_to_be_staged(StageScheduleState &state, const No continue; } - if (other_stage_has_same_producer(e->producer) || producer_computed_here_or_further_in(e->producer) || !e->all_load_jacobian_coeffs_exist()) { + if (other_stage_has_same_producer(e->producer) || producer_computed_here_or_further_in(e->producer) 
|| + !e->all_load_jacobian_coeffs_exist()) { continue; } @@ -4053,7 +4284,8 @@ bool LoopNest::has_valid_thread_extents() const { return true; } -void LoopNest::collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, NodeMap &inlined_nodes) const { +void LoopNest::collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, + NodeMap &inlined_nodes) const { if (innermost) { for (auto it = inlined.begin(); it != inlined.end(); it++) { const auto *f = it.key(); diff --git a/src/autoschedulers/anderson2021/LoopNest.h b/src/autoschedulers/anderson2021/LoopNest.h index b315d460e9c7..b0dee6cedf24 100644 --- a/src/autoschedulers/anderson2021/LoopNest.h +++ b/src/autoschedulers/anderson2021/LoopNest.h @@ -29,21 +29,25 @@ using NodeMap = PerfectHashMap; template using StageMap = PerfectHashMap; -enum class GPU_parallelism { Block, - Thread, - Serial, - Simd, - Parallelized, - None }; +enum class GPU_parallelism { + Block, + Thread, + Serial, + Simd, + Parallelized, + None +}; std::string stringify(GPU_parallelism label); // inlined => func is inlined so has no memory store location -enum class GPUMemoryType { Global, - Shared, - Local, - Registers, - Inlined }; +enum class GPUMemoryType { + Global, + Shared, + Local, + Registers, + Inlined +}; bool may_subtile(const Anderson2021Params ¶ms); @@ -234,9 +238,13 @@ struct LoopNest { } }; - GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined = false) const; + GPUMemoryType get_gpu_memory_type(bool in_block, + bool in_thread, + bool is_inlined = false) const; - std::vector unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const; + std::vector unrolled_loops(const Target &target, + const LoopNest *parent, + const LoopNest *grandparent) const; void get_allocs_that_can_be_promoted_to_registers(const Target &target, StageMap &sites, @@ -244,7 +252,8 @@ struct LoopNest { const LoopNest *grandparent, const LoopNest *parent) const; - bool promote_allocs_to_registers(const Target &target, StageMap &sites) const; + bool promote_allocs_to_registers(const Target &target, + StageMap &sites) const; // Compute all the sites of interest for each pipeline stage void get_sites(const Target &target, @@ -265,7 +274,9 @@ struct LoopNest { } } - bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const; + bool exceeds_serial_extents_limit(const Target &target, + const LoopNest *parent, + bool in_threads_loop) const; bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const; @@ -279,39 +290,105 @@ struct LoopNest { // Get the stride over "node's" storage for a unit increment in the vectorized loop's // index - double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const; - - Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose = false) const; - - bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const; + double storage_stride(const LoadJacobian &jac, + int innermost_storage_dim, + const FunctionDAG::Node *storage_node, + const Bound &store_bounds, + const LoopNest &root) const; + + Strides compute_strides(const LoadJacobian &jac, + int innermost_storage_dim, + const FunctionDAG::Node *storage_node, + const Bound &store_bounds, + 
const ThreadInfo *thread_info, + bool verbose = false) const; + + bool all_strides_exist(const LoadJacobian &jac, + const FunctionDAG::Node *storage_node, + const LoopNest &root) const; int get_actual_vector_dim(const Bound &store_bounds) const; - void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose = false) const; - - bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const; - - bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const; - - int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; - - int vectorized_access_size(size_t loop_index, bool verbose = false) const; + void compute_gpu_store_features(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const GPULoopInfo &gpu_loop_info, + const std::vector &inner_serial_loop_extents, + const Sites &consumer_site, + ScheduleFeatures &feat, + const LoopNest *parent, + const LoopNest &root, + GlobalMemInfo &global_mem_loads, + SharedMemInfo &shared_mem_loads, + LocalMemInfo &local_mem_loads, + bool verbose = false) const; + + bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + int innermost_dim, + int loop_index) const; + + bool can_vectorize_store_access(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + bool accessed_has_been_scheduled, + int innermost_dim, + int loop_index, + const GPUMemoryType &mem_type) const; + + int vectorized_load_access_size(const LoadJacobian &jac, + const FunctionDAG::Node *accessed, + bool accessed_has_been_scheduled, + int innermost_dim, + const GPUMemoryType &mem_type, + bool verbose = false) const; + + int vectorized_access_size(size_t loop_index, + bool verbose = false) const; template - void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType &mem_info, bool verbose = false) const; - - std::pair compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const; + void compute_num_mem_accesses_per_block(const LoadJacobian &jac, + const FunctionDAG::Node *node, + const Bound &store_bounds, + const ThreadInfo *thread_info, + int innermost_dim, + double num_requests_per_warp, + MemInfoType &mem_info, + bool verbose = false) const; + + std::pair compute_local_mem_store_features(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const LoopNest &root, + double serial_loop_extents) const; template - MemInfoType compute_mem_store_info(const 
LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const; + MemInfoType compute_mem_store_info(const LoadJacobian &jac, + int consumer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &consumer_store_bounds, + const ThreadInfo *thread_info, + double serial_loop_extents, + bool verbose) const; template - void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType &mem_info, double serial_loop_extents, bool verbose = false) const; - - double compute_local_mem_stride(double stride, double bytes) const; + void compute_mem_load_features(const LoadJacobian &jac, + int producer_innermost_dim, + const FunctionDAG::Node *node, + const Bound &producer_store_bounds, + bool producer_has_been_scheduled, + const ThreadInfo *thread_info, + MemInfoType &mem_info, + double serial_loop_extents, + bool verbose = false) const; + + double compute_local_mem_stride(double stride, + double bytes) const; // Assumes block, serial, thread or block, thread nesting - const LoopNest *get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const; + const LoopNest *get_enclosing_block(const LoopNest *parent, + const LoopNest *grandparent) const; std::pair get_block_and_serial_extents(const LoopNest *block) const; @@ -319,20 +396,44 @@ struct LoopNest { bool has_thread_loop_descendant() const; - void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const; + void compute_warp_features(ScheduleFeatures &features, + const GPULoopInfo &gpu_loop_info) const; // Assume that when a block is active, all its warps are active - void compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const; + void compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, + ScheduleFeatures &feat, + const GPULoopInfo &gpu_loop_info) const; - void compute_shared_mem_occupancy(const Anderson2021Params ¶ms, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const; + void compute_shared_mem_occupancy(const Anderson2021Params ¶ms, + const Target &target, + int64_t total_shared_mem_alloc_size, + ScheduleFeatures &feat) const; std::pair find_innermost_and_parent() const; - int64_t points_accessed_per_thread(const Anderson2021Params ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose = false) const; - - int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const; - - void memoize_points_computed_minimum(StageMap &memoized_features, const StageMap *features) const; + int64_t points_accessed_per_thread(const Anderson2021Params ¶ms, + const Target &target, + const GPULoopInfo &gpu_loop_info, + const std::vector &edge_chain, + const LoadJacobian &jac, + const LoopNest *parent, + const LoopNest *grandparent, + int64_t n, + const ScheduleFeatures &feat, + const LoadJacobian &serial_jac, + bool 
producer_has_been_scheduled, + int producer_innermost_dim, + const GPUMemoryType &mem_type, + bool verbose) const; + + int64_t compute_licm_amortization(const LoopNest *innermost, + const LoopNest *parent, + const ScheduleFeatures &feat, + const LoadJacobian &jac, + int producer_dims) const; + + void memoize_points_computed_minimum(StageMap &memoized_features, + const StageMap *features) const; vector> collect_producers(const StageMap &sites) const; @@ -340,12 +441,14 @@ struct LoopNest { void collect_stages(std::set &stages) const; - void memoize_features(StageMap &memoized_features, const StageMap *features) const; + void memoize_features(StageMap &memoized_features, + const StageMap *features) const; void compute_working_set_from_features(int64_t *working_set, const StageMap *features) const; - void recompute_inlined_features(const StageMap &sites, StageMap *features) const; + void recompute_inlined_features(const StageMap &sites, + StageMap *features) const; std::pair compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const; @@ -389,7 +492,8 @@ struct LoopNest { // consumers along the given edge chain), from which we know what region // would be computed if it were scheduled here and what its loop nest // would be. - Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector &edge_chain) const; + Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, + const vector &edge_chain) const; void dump() const; @@ -443,13 +547,16 @@ struct LoopNest { bool move_all_rvars_inward = true, const vector &rvars_to_move_inward = {}) const; - int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false, bool in_threads_loop = false) const; + int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false, + bool in_threads_loop = false) const; int64_t get_total_constant_local_mem_alloc_size() const; // All store ats further in than the block level must be fixed // sized allocations. This method checks if f will require a dynamic // allocation - bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const; + bool requires_dynamic_allocation(const FunctionDAG::Node *f, + const Target &target, + bool in_threads_loop) const; // Return all possible ways to compute f in tiles somewhere within // this loop nest. @@ -501,18 +608,19 @@ struct LoopNest { size_t index = 0; // Some flags. 
- bool innermost_pure_dim = false, - outermost = false, - parallel = false, - exists = false, - pure = false, - constant_extent = false; + bool innermost_pure_dim = false; + bool outermost = false; + bool parallel = false; + bool exists = false; + bool pure = false; + bool constant_extent = false; bool vectorized = false; bool gpu_threads = false; FuncVar() - : orig(Var()), var(Var()) { + : orig(Var()), + var(Var()) { } }; const FunctionDAG::Node *node; @@ -529,7 +637,8 @@ struct LoopNest { vector ordered_vars; vector gpu_thread_extents; - NodeMap>>> producers_to_be_staged; + NodeMap>>> + producers_to_be_staged; // From outermost in vector ancestors; @@ -544,8 +653,10 @@ struct LoopNest { int num_serial_loops() const; bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const; - void update_producers_to_be_staged(StageScheduleState &state, const NodeMap &all_inlined) const; - bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const; + void update_producers_to_be_staged(StageScheduleState &state, + const NodeMap &all_inlined) const; + bool region_computed_shrinks(const FunctionDAG::Node *f, + const LoopNest *parent) const; // Apply the schedule represented by this loop nest to a Halide pipeline. void apply(LoopLevel here, @@ -558,18 +669,21 @@ struct LoopNest { std::vector &ancestors, const NodeMap &all_inlined) const; - double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const; + double max_idle_lane_wastage(const Target &target, + GPULoopInfo gpu_loop_info) const; bool has_valid_thread_extents() const; - void collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, NodeMap &inlined_nodes) const; + void collect_nodes_that_should_be_inlined(const NodeMap &nodes_to_freeze, + NodeMap &inlined_nodes) const; void collect_all_inlined(NodeMap &all_inlined) const; int64_t product_of_self_and_descendants(int loop_index) const; int64_t product_of_descendants(int loop_index) const; - void get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, const LoopNest *compute_root_loop_nest = nullptr) const; + void get_stages_computed_in_each_compute_root_loop(StageMap> &descendants, + const LoopNest *compute_root_loop_nest = nullptr) const; }; struct Filter { @@ -577,7 +691,8 @@ struct Filter { bool logging = false; explicit Filter(const LoopNest *loop_nest) - : loop_nest{loop_nest}, logging{enable_filter_printing()} { + : loop_nest{loop_nest}, + logging{enable_filter_printing()} { if (logging) { std::cerr << "\nState filtered: \n"; loop_nest->dump(); diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index 278a6d30808d..87cd7cf880bd 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -13,11 +13,19 @@ SearchSpace::SearchSpace(const FunctionDAG &dag, CostModel *cost_model, Statistics &stats, const LoopNestParser *partial_schedule) - : dag{dag}, params{params}, target{target}, search_space_options{params.search_space_options}, rng{rng}, cost_model{cost_model}, stats{stats}, partial_schedule{partial_schedule} { + : dag{dag}, + params{params}, + target{target}, + search_space_options{params.search_space_options}, + rng{rng}, + cost_model{cost_model}, + stats{stats}, + partial_schedule{partial_schedule} { memoized_compute_root_blocks.make_large(dag.nodes.size()); } -void SearchSpace::memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root) { +void 
SearchSpace::memoize_blocks(const FunctionDAG::Node *node, + LoopNest *new_root) { int vector_dim = -1; bool loop_nest_found = false; for (auto &c : new_root->children) { @@ -141,8 +149,7 @@ vector SearchSpace::filter_parallel_tile_option } const double tasks_per_core = ((double)total) / params.parallelism; o.idle_core_wastage = std::max(o.idle_core_wastage, - std::ceil(tasks_per_core) / - tasks_per_core); + std::ceil(tasks_per_core) / tasks_per_core); } } } @@ -391,7 +398,15 @@ void SearchSpace::generate_children(const IntrusivePtr &state, std::unordered_map secondary_options; for (int vector_dim : vector_dims) { Timer timer; - auto tile_options = root->compute_in_tiles(node, nullptr, params, target, search_space_options, vector_dim, false, false, is_pre_pass); + auto tile_options = root->compute_in_tiles(node, + nullptr, + params, + target, + search_space_options, + vector_dim, + false, + false, + is_pre_pass); stats.compute_in_tiles_time += timer.elapsed(); timer.restart(); @@ -490,7 +505,13 @@ void SearchSpace::generate_children(const IntrusivePtr &state, // at root level sibling thread counts are in separate blocks, extents are irrelevant vector max_size((int)(stage_sizes[0].size()), 1); - auto block_tilings = generate_gpu_tilings(stage_sizes, pure_dims, max_size, node->dimensions - 1, vectorized_indices, false, true); + auto block_tilings = generate_gpu_tilings(stage_sizes, + pure_dims, + max_size, + node->dimensions - 1, + vectorized_indices, + false, + true); // If no options, create a thread tiling as large as possible with block size (1,1,1). // This can happen if the loops are too small to generate desired gpu tiles. @@ -517,7 +538,10 @@ void SearchSpace::generate_children(const IntrusivePtr &state, double prev_idle_core_wastage = 0; for (const auto &o : options) { - if (!params.randomize_tilings && num_children >= 1 && o.idle_core_wastage > 1.2 && o.idle_core_wastage != prev_idle_core_wastage) { + if (!params.randomize_tilings && + num_children >= 1 && + o.idle_core_wastage > 1.2 && + o.idle_core_wastage != prev_idle_core_wastage) { // We have considered several options, and the // remaining ones leave lots of cores idle. 
break; @@ -606,9 +630,8 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr &best) { internal_assert(n.first >= 0); } - std::sort(node_ids_and_costs.begin(), node_ids_and_costs.end(), [](const std::pair &a, const std::pair &b) { - return a.second < b.second; - }); + std::sort(node_ids_and_costs.begin(), node_ids_and_costs.end(), + [](const std::pair &a, const std::pair &b) { return a.second < b.second; }); size_t num_to_freeze = num_nodes - std::log2(num_nodes); NodeMap nodes_to_freeze; @@ -631,7 +654,8 @@ void SearchSpace::freeze_lowest_cost_stages(const IntrusivePtr &best) { } } -vector> SearchSpace::generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, const FunctionDAG::Node *node) const { +vector> SearchSpace::generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, + const FunctionDAG::Node *node) const { std::vector vec_dim_serial_sizes; pure_stage->generate_vec_dim_serial_tilings(vec_dim_serial_sizes); diff --git a/src/autoschedulers/anderson2021/SearchSpace.h b/src/autoschedulers/anderson2021/SearchSpace.h index 1e80c0e1760f..b9dddb4c7f04 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.h +++ b/src/autoschedulers/anderson2021/SearchSpace.h @@ -69,7 +69,8 @@ struct SearchSpace { vector filter_thread_tile_options(vector> &loop_nests) const; - void memoize_blocks(const FunctionDAG::Node *node, LoopNest *new_root); + void memoize_blocks(const FunctionDAG::Node *node, + LoopNest *new_root); bool add_states_from_memoized_blocks(const IntrusivePtr &state, std::function &&)> &accept_child, @@ -84,7 +85,8 @@ struct SearchSpace { void freeze_lowest_cost_stages(const IntrusivePtr &best); - vector> generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, const FunctionDAG::Node *node) const; + vector> generate_compute_root_serial_tilings(const IntrusivePtr &pure_stage, + const FunctionDAG::Node *node) const; bool add_child(const IntrusivePtr &state, const IntrusivePtr &new_root, diff --git a/src/autoschedulers/anderson2021/State.cpp b/src/autoschedulers/anderson2021/State.cpp index 60a990dff536..e58507c5b7b8 100644 --- a/src/autoschedulers/anderson2021/State.cpp +++ b/src/autoschedulers/anderson2021/State.cpp @@ -19,15 +19,21 @@ uint64_t State::structural_hash(int depth) const { } // Compute the parent and depth of every loop nest node -void State::compute_loop_nest_parents(map> &p, - const LoopNest *here, int depth) const { +void State::compute_loop_nest_parents(LoopNestMap &p, + const LoopNest *here, + int depth) const { for (const auto &c : here->children) { p.emplace(c.get(), pair{here, depth}); compute_loop_nest_parents(p, c.get(), depth + 1); } } -const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params ¶ms, const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const { +const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params ¶ms, + const LoopNestMap &parent, + const FunctionDAG::Node &node, + const LoopNest *loop, + const LoopNest *root, + StageMap &total_shared_mem_alloc_sizes) const { std::vector ancestors; // Innermost loop nests are never considered as compute locations @@ -102,7 +108,8 @@ const LoopNest *State::deepest_valid_compute_location(const Anderson2021Params & return candidate; } -int64_t State::total_loop_extents_of_ancestors(const map> &parent, const LoopNest *loop) const { +int64_t State::total_loop_extents_of_ancestors(const LoopNestMap &parent, + const LoopNest *loop) const { int64_t 
total = 1; if (loop->is_root()) { @@ -125,7 +132,9 @@ int64_t State::total_loop_extents_of_ancestors(const map> &parent, const LoopNest *a, const LoopNest *b) const { +const LoopNest *State::deepest_common_ancestor(const LoopNestMap &parent, + const LoopNest *a, + const LoopNest *b) const { if (a->is_root()) { return a; } @@ -343,7 +352,8 @@ void State::FeatureLoopNestMutator::add_outer_thread_loops(LoopNest *loop_nest) } } -IntrusivePtr State::get_root_for_features(const Anderson2021Params ¶ms, const Target &target) const { +IntrusivePtr State::get_root_for_features(const Anderson2021Params ¶ms, + const Target &target) const { if (!has_compute_root_loops_without_blocks() && !has_loop_nest_without_thread_loops()) { return root; } @@ -352,7 +362,8 @@ IntrusivePtr State::get_root_for_features(const Anderson2021Para // We copy the loop nest in 2 cases: // - If the current loop nest has compute root loops without blocks (it is - // in phase 1 and the outer loops are marked 'none'), we split the loop into blocks and threads so we can compute meaningful features + // in phase 1 and the outer loops are marked 'none'), we split the loop into blocks and threads so we can compute + // meaningful features // - If there are serial loops inside blocks without a surrounding // thread loop nest, we create a surrounding thread loop nest with // extents 1 (which Halide will do when the schedule is compiled) so @@ -361,7 +372,9 @@ IntrusivePtr State::get_root_for_features(const Anderson2021Para return new_root; } -void State::set_gpu_store_site(const map> &parent, const LoopNest *loop, LoopNest::Sites &site) const { +void State::set_gpu_store_site(const LoopNestMap &parent, + const LoopNest *loop, + LoopNest::Sites &site) const { // If site.store is inside a block but outside a loop, the // GPU store site should instead be the block because the shared // mem allocation will be hoisted @@ -393,7 +406,12 @@ void State::set_gpu_store_site(const map *features, Statistics &stats, bool verbose) const { +bool State::compute_featurization(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + StageMap *features, + Statistics &stats, + bool verbose) const { auto feature_root = get_root_for_features(params, target); StageMap sites; @@ -426,7 +444,7 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para // For the unscheduled nodes, give them sites as deep as they // could possibly be. We'll ignore the possibility of inlining // them for now. - map> parent; + LoopNestMap parent; compute_loop_nest_parents(parent, feature_root.get(), 0); for (const auto &n : dag.nodes) { if (sites.contains(&(n.stages[0]))) { @@ -474,14 +492,17 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para } } } - internal_assert(loop) - << "Could not compute plausible site for unscheduled Func: " - << n.func.name() << "\n"; + internal_assert(loop) << "Could not compute plausible site for unscheduled Func: " << n.func.name() << "\n"; // If 'loop' would never be considered as a compute location (i.e. 
by // LoopNest::compute_in_tiles()), walk up the loop nest until we reach a // location that would be considered - loop = deepest_valid_compute_location(params, parent, n, loop, feature_root.get(), total_shared_mem_alloc_sizes); + loop = deepest_valid_compute_location(params, + parent, + n, + loop, + feature_root.get(), + total_shared_mem_alloc_sizes); int64_t num_realizations = total_loop_extents_of_ancestors(parent, loop); for (const auto &stage : n.stages) { @@ -501,7 +522,24 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para Timer timer; feature_root->dump(); - feature_root->compute_features(dag, params, target, sites, 1, 1, nullptr, nullptr, *feature_root, GPULoopInfo(feature_root.get()), true, total_shared_mem_alloc_sizes, nullptr, nullptr, nullptr, features, stats, verbose); + feature_root->compute_features(dag, + params, + target, + sites, + 1, + 1, + nullptr, + nullptr, + *feature_root, + GPULoopInfo(feature_root.get()), + true, + total_shared_mem_alloc_sizes, + nullptr, + nullptr, + nullptr, + features, + stats, + verbose); stats.featurization_time += timer.elapsed(); ++stats.num_featurizations; @@ -509,15 +547,17 @@ bool State::compute_featurization(const FunctionDAG &dag, const Anderson2021Para for (const auto &n : dag.nodes) { if (sites.get(&(n.stages[0])).produce == nullptr) { internal_assert(!features->contains(&(n.stages[0]))) - << "Somehow an input or unscheduled node ended up in the featurization: " - << n.func.name() << "\n"; + << "Somehow an input or unscheduled node ended up in the featurization: " << n.func.name() << "\n"; } } return true; } -void State::save_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, std::ostream &out) const { +void State::save_featurization(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + std::ostream &out) const { StageMap features; Statistics stats; compute_featurization(dag, params, target, &features, stats); @@ -547,7 +587,8 @@ void State::save_featurization(const FunctionDAG &dag, const Anderson2021Params } } -bool State::contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const { +bool State::contains_store_at(const set &outermost_store_at, + const IntrusivePtr &parent) const { for (const auto &c : parent->children) { if (!c->store_at.empty()) { return true; @@ -594,7 +635,8 @@ bool State::exceeds_serial_extents_limit(const Target &target) const { return root->exceeds_serial_extents_limit(target, nullptr, false); } -int64_t State::get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const { +int64_t State::get_shared_mem_alloc_size(const LoopNest *block, + const LoopNest *loop) const { int64_t result = 0; if (loop->gpu_label == GPU_parallelism::Thread) { @@ -622,7 +664,8 @@ int64_t State::get_shared_mem_alloc_size(const LoopNest *block, const LoopNest * return result; } -bool State::exceeds_shared_memory_limit(const Anderson2021Params ¶ms, const Target &target) const { +bool State::exceeds_shared_memory_limit(const Anderson2021Params ¶ms, + const Target &target) const { if (!target.has_gpu_feature()) { return false; } @@ -644,7 +687,8 @@ bool State::exceeds_shared_memory_limit(const Anderson2021Params ¶ms, const return false; } -bool State::exceeds_local_memory_limit(const Anderson2021Params ¶ms, const Target &target) const { +bool State::exceeds_local_memory_limit(const Anderson2021Params ¶ms, + const Target &target) const { if (!target.has_gpu_feature()) { return false; } @@ -662,7 
+706,12 @@ bool State::exceeds_local_memory_limit(const Anderson2021Params ¶ms, const T return false; } -bool State::calculate_cost(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose) { +bool State::calculate_cost(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + CostModel *cost_model, + Statistics &stats, + bool verbose) { Timer timer; if (!root->has_valid_thread_extents()) { Filter(root.get()) << "Invalid thread extents\n"; @@ -778,7 +827,11 @@ void State::print_compute_locations() const { aslog(1) << "END compute locations\n"; } -void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector ¶llel_vars, const vector ¶llel_extents, const vector &constant_extents) const { +void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, + Stage &stage, + const vector ¶llel_vars, + const vector ¶llel_extents, + const vector &constant_extents) const { if (parallel_vars.empty() || parallel_extents.empty()) { return; } @@ -848,12 +901,16 @@ void State::fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, c } } -void State::mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector ¶llel_vars, const vector ¶llel_extents) const { +void State::mark_gpu_blocks(LoopNest::StageScheduleState *state, + Stage &stage, + const vector ¶llel_vars, + const vector ¶llel_extents) const { int max_blocks[3] = {2147483647, 65535, 65535}; uint8_t n_loops_tagged_gpu_blocks = 0; for (const auto &v : parallel_vars) { - if (n_loops_tagged_gpu_blocks >= 3 || parallel_extents[n_loops_tagged_gpu_blocks] > max_blocks[n_loops_tagged_gpu_blocks]) { + if (n_loops_tagged_gpu_blocks >= 3 || + parallel_extents[n_loops_tagged_gpu_blocks] > max_blocks[n_loops_tagged_gpu_blocks]) { break; } @@ -867,7 +924,10 @@ void State::mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, c } } -bool State::mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const { +bool State::mark_gpu_threads(LoopNest::StageScheduleState *state, + Stage &stage, + std::unordered_set &new_serial_vars, + std::ostringstream &staged_funcs_schedule_source) const { uint8_t num_loops_tagged_gpu_thread = 0; int64_t total_threads = 1; int max_threads[3] = {1024, 1024, 64}; @@ -879,7 +939,9 @@ bool State::mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, continue; } - if (num_loops_tagged_gpu_thread >= 3 || total_threads >= MAX_THREADS_PER_BLOCK || v.extent > max_threads[num_loops_tagged_gpu_thread]) { + if (num_loops_tagged_gpu_thread >= 3 || + total_threads >= MAX_THREADS_PER_BLOCK || + v.extent > max_threads[num_loops_tagged_gpu_thread]) { break; } @@ -1147,7 +1209,10 @@ void State::apply_schedule(const FunctionDAG &dag, const Anderson2021Params &par } } - bool thread_loop_exists = mark_gpu_threads(p.second.get(), stage, new_serial_vars, staged_funcs_schedule_source); + bool thread_loop_exists = mark_gpu_threads(p.second.get(), + stage, + new_serial_vars, + staged_funcs_schedule_source); // The stage has no threads and no blocks. 
This is likely an update // stage where the reduction is a serial loop if (!thread_loop_exists && !has_enclosing_parallel) { diff --git a/src/autoschedulers/anderson2021/State.h b/src/autoschedulers/anderson2021/State.h index 846c895a4c53..c2b0371dce3f 100644 --- a/src/autoschedulers/anderson2021/State.h +++ b/src/autoschedulers/anderson2021/State.h @@ -47,7 +47,10 @@ struct NoOpMutator { }; template -void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr &existing_loop_nest, const PostCreateMutator &post_create_mutator) { +void deep_copy_loop_nest(LoopNest *new_loop_nest, + const LoopNest *new_loop_nest_parent, + const IntrusivePtr &existing_loop_nest, + const PostCreateMutator &post_create_mutator) { new_loop_nest->copy_from(*existing_loop_nest); for (std::size_t i = 0, N = new_loop_nest->children.size(); i < N; ++i) { @@ -59,8 +62,11 @@ void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_ post_create_mutator(new_loop_nest); } +using LoopNestMap = map>; + template -LoopNest *deep_copy_loop_nest(const IntrusivePtr &loop_nest, const PostCreateMutator &post_create_mutator) { +LoopNest *deep_copy_loop_nest(const IntrusivePtr &loop_nest, + const PostCreateMutator &post_create_mutator) { LoopNest *new_loop_nest = new LoopNest; deep_copy_loop_nest(new_loop_nest, nullptr, loop_nest, post_create_mutator); return new_loop_nest; @@ -86,11 +92,13 @@ struct State { uint64_t structural_hash(int depth) const; // Compute the parent and depth of every loop nest node - void compute_loop_nest_parents(map> &p, - const LoopNest *here, int depth) const; + void compute_loop_nest_parents(LoopNestMap &p, + const LoopNest *here, + int depth) const; - const LoopNest *deepest_common_ancestor(const map> &parent, - const LoopNest *a, const LoopNest *b) const; + const LoopNest *deepest_common_ancestor(const LoopNestMap &parent, + const LoopNest *a, + const LoopNest *b) const; // We use the post_create_mutator so that the loop nests can be modified // before they become IntrusivePtr as children and cannot be modified @@ -122,15 +130,27 @@ struct State { void add_outer_thread_loops(LoopNest *loop_nest) const; }; - IntrusivePtr get_root_for_features(const Anderson2021Params ¶ms, const Target &target) const; + IntrusivePtr get_root_for_features(const Anderson2021Params ¶ms, + const Target &target) const; - void set_gpu_store_site(const map> &parent, const LoopNest *loop, LoopNest::Sites &site) const; + void set_gpu_store_site(const LoopNestMap &parent, + const LoopNest *loop, + LoopNest::Sites &site) const; - bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, StageMap *features, Statistics &stats, bool verbose = false) const; + bool compute_featurization(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + StageMap *features, + Statistics &stats, + bool verbose = false) const; - void save_featurization(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, std::ostream &out) const; + void save_featurization(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + std::ostream &out) const; - bool contains_store_at(const set &outermost_store_at, const IntrusivePtr &parent) const; + bool contains_store_at(const set &outermost_store_at, + const IntrusivePtr &parent) const; // For GPU, only allow store_at root or inside the outermost loop nest. 
Any // store_ats further in will be hoisted and expanded, increasing the @@ -141,13 +161,21 @@ struct State { bool exceeds_serial_extents_limit(const Target &target) const; - int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const; + int64_t get_shared_mem_alloc_size(const LoopNest *block, + const LoopNest *loop) const; - bool exceeds_shared_memory_limit(const Anderson2021Params ¶ms, const Target &target) const; + bool exceeds_shared_memory_limit(const Anderson2021Params ¶ms, + const Target &target) const; - bool exceeds_local_memory_limit(const Anderson2021Params ¶ms, const Target &target) const; + bool exceeds_local_memory_limit(const Anderson2021Params ¶ms, + const Target &target) const; - bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose = false); + bool calculate_cost(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target, + CostModel *cost_model, + Statistics &stats, + bool verbose = false); // Make a child copy of this state. The loop nest is const (we // make mutated copies of it, rather than mutating it), so we can @@ -159,25 +187,43 @@ struct State { void print_compute_locations() const; - void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector ¶llel_vars, const vector ¶llel_extents, const vector &constant_extents) const; + void fuse_gpu_blocks(LoopNest::StageScheduleState *state, + Stage &stage, + const vector ¶llel_vars, + const vector ¶llel_extents, + const vector &constant_extents) const; - void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector ¶llel_vars, const vector ¶llel_extents) const; + void mark_gpu_blocks(LoopNest::StageScheduleState *state, + Stage &stage, + const vector ¶llel_vars, + const vector ¶llel_extents) const; - bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const; + bool mark_gpu_threads(LoopNest::StageScheduleState *state, + Stage &stage, + std::unordered_set &new_serial_vars, + std::ostringstream &staged_funcs_schedule_source) const; bool can_fuse_gpu(const vector ¶llel_extents) const; // Apply the schedule represented by this state to a Halide // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. 
- void apply_schedule(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target); + void apply_schedule(const FunctionDAG &dag, + const Anderson2021Params ¶ms, + const Target &target); bool should_always_consider_inline(const FunctionDAG::Node *node) const; void add_to_always_consider_inline_options(const FunctionDAG::Node *node); void update_always_consider_inline_options(const FunctionDAG::Node *node); - const LoopNest *deepest_valid_compute_location(const Anderson2021Params ¶ms, const map> &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap &total_shared_mem_alloc_sizes) const; - int64_t total_loop_extents_of_ancestors(const map> &parent, const LoopNest *loop) const; + const LoopNest *deepest_valid_compute_location(const Anderson2021Params ¶ms, + const LoopNestMap &parent, + const FunctionDAG::Node &node, + const LoopNest *loop, + const LoopNest *root, + StageMap &total_shared_mem_alloc_sizes) const; + int64_t total_loop_extents_of_ancestors(const LoopNestMap &parent, + const LoopNest *loop) const; }; // A priority queue of states, sorted according to increasing diff --git a/src/autoschedulers/anderson2021/Statistics.h b/src/autoschedulers/anderson2021/Statistics.h index a42717f75609..f725129d40ef 100644 --- a/src/autoschedulers/anderson2021/Statistics.h +++ b/src/autoschedulers/anderson2021/Statistics.h @@ -20,7 +20,8 @@ struct ScopedStatistic { std::string msg; ScopedStatistic(const T &value, const std::string &msg) - : value{value}, msg{msg} { + : value{value}, + msg{msg} { } ~ScopedStatistic() { @@ -33,7 +34,8 @@ struct ScopedTimer { std::string msg; explicit ScopedTimer(const std::string &msg) - : start{Clock::now()}, msg{msg} { + : start{Clock::now()}, + msg{msg} { aslog(1) << "Start: " << msg << "\n"; } diff --git a/src/autoschedulers/anderson2021/Tiling.cpp b/src/autoschedulers/anderson2021/Tiling.cpp index 780151e6b9ec..4d4006d757e5 100644 --- a/src/autoschedulers/anderson2021/Tiling.cpp +++ b/src/autoschedulers/anderson2021/Tiling.cpp @@ -15,7 +15,8 @@ bool all_ones(const std::vector &nums) { return true; } -bool equal_to_existing_size(const std::vector &s, const std::vector &nums) { +bool equal_to_existing_size(const std::vector &s, + const std::vector &nums) { for (size_t i = 0; i < s.size(); ++i) { if (s[i] != nums[i]) { return false; @@ -24,7 +25,8 @@ bool equal_to_existing_size(const std::vector &s, const std::vector> generate_serial_tilings(const std::vector &s, int d, +std::vector> generate_serial_tilings(const std::vector &s, + int d, int last_d, int vectorized_index, const std::vector &vec_dim_serial_sizes, @@ -35,7 +37,13 @@ std::vector> generate_serial_tilings(const std::vector> v; - v = generate_serial_tilings(s, d - 1, last_d, vectorized_index, vec_dim_serial_sizes, filter_small_outer_extents, allow_inner_ones); + v = generate_serial_tilings(s, + d - 1, + last_d, + vectorized_index, + vec_dim_serial_sizes, + filter_small_outer_extents, + allow_inner_ones); for (auto t : v) { t.push_back(0); bool used_full_extent = false; @@ -90,7 +98,9 @@ std::vector> generate_serial_tilings(const std::vector> generate_tilings(const std::vector &s, int d, int factor, +std::vector> generate_tilings(const std::vector &s, + int d, + int factor, bool allow_splits, const std::vector &inner_sizes) { std::vector> result; @@ -199,7 +209,9 @@ std::vector> generate_tilings(const std::vector &s // Moves vectorized dimension first and also removes dimensions with size 1 // to reflect actual thread dimensions when loop nests are lowered 
-void lowered_dims(const std::vector &size, int vector_loop_i, std::vector &lowered_size) { +void lowered_dims(const std::vector &size, + int vector_loop_i, + std::vector &lowered_size) { if (vector_loop_i >= 0 && size[vector_loop_i] > 1) { lowered_size.push_back(size[vector_loop_i]); } @@ -238,12 +250,20 @@ std::vector> generate_gpu_tilings(const std::vector> v; - v = generate_gpu_tilings(stage_sizes, pure_dims, max_s, d - 1, vectorized_indices, serial_inner, is_compute_root_stage); + v = generate_gpu_tilings(stage_sizes, + pure_dims, + max_s, + d - 1, + vectorized_indices, + serial_inner, + is_compute_root_stage); for (auto t : v) { - enum validity { serial_count_err, - thread_count_err, - valid_tiling }; + enum validity { + serial_count_err, + thread_count_err, + valid_tiling + }; // helper function detects whether tiling is legal: cannot exceed max thread count, // have more than three dimensions with ext > 1, or result in large serial loops @@ -314,11 +334,14 @@ std::vector> generate_gpu_tilings(const std::vector max_threads_extent) || (d != vectorized_indices[0] && threads_ext > 16)) { + if ((d == vectorized_indices[0] && threads_ext > max_threads_extent) || + (d != vectorized_indices[0] && threads_ext > 16)) { break; } int64_t other_ext = (stage_sizes[0][d] + threads_ext - 1) / threads_ext; - if (d != vectorized_indices[0] && threads_ext > 1 && threads_ext * other_ext * 7 > stage_sizes[0][d] * 8) { + if (d != vectorized_indices[0] && + threads_ext > 1 && + threads_ext * other_ext * 7 > stage_sizes[0][d] * 8) { break; } t.back() = threads_ext; diff --git a/src/autoschedulers/anderson2021/Tiling.h b/src/autoschedulers/anderson2021/Tiling.h index fb82672b2e06..b1e711f93ad0 100644 --- a/src/autoschedulers/anderson2021/Tiling.h +++ b/src/autoschedulers/anderson2021/Tiling.h @@ -10,9 +10,11 @@ namespace Autoscheduler { bool all_ones(const std::vector &nums); -bool equal_to_existing_size(const std::vector &s, const std::vector &nums); +bool equal_to_existing_size(const std::vector &s, + const std::vector &nums); -std::vector> generate_serial_tilings(const std::vector &s, int d, +std::vector> generate_serial_tilings(const std::vector &s, + int d, int last_d, int vectorized_index, const std::vector &vec_dim_serial_sizes, @@ -27,13 +29,17 @@ std::vector> generate_serial_tilings(const std::vector> generate_tilings(const std::vector &s, int d, int factor, +std::vector> generate_tilings(const std::vector &s, + int d, + int factor, bool allow_splits, const std::vector &inner_sizes = std::vector()); /** moves vectorized dimension first and also removes dimensions with size 1 to reflect actual thread dimensions when loop nests are lowered **/ -void lowered_dims(const std::vector &size, int vector_loop_i, std::vector &lowered_size); +void lowered_dims(const std::vector &size, + int vector_loop_i, + std::vector &lowered_size); // creates tilings for gpu threads loops. // Innermost thread loop is always the vectorized dim and its extent is a multiple of 32. 
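The two helpers documented above are small enough to illustrate outside the autoscheduler. The sketch below is only an approximation of the documented behaviour, not the Halide implementation; the names lowered_dims_sketch and plausible_thread_tiling are hypothetical. It shows the vectorized extent being moved to the front with unit extents dropped, and a candidate thread tiling being kept only if its innermost (vectorized) extent is a multiple of 32 and at most three extents are greater than one, matching the "no more than three dimensions with ext > 1" rule noted in generate_gpu_tilings.

    // Illustrative sketch only: mirrors the documented behaviour of
    // lowered_dims() and the thread-tiling rules above; not the Halide code.
    #include <cstdint>
    #include <vector>

    // Move the vectorized extent first and drop extents equal to 1, so the
    // result reflects the thread dimensions that survive lowering.
    std::vector<int64_t> lowered_dims_sketch(const std::vector<int64_t> &size, int vector_loop_i) {
        std::vector<int64_t> lowered;
        if (vector_loop_i >= 0 && size[vector_loop_i] > 1) {
            lowered.push_back(size[vector_loop_i]);
        }
        for (int i = 0; i < (int)size.size(); ++i) {
            if (i != vector_loop_i && size[i] > 1) {
                lowered.push_back(size[i]);
            }
        }
        return lowered;
    }

    // Keep a candidate thread tiling only if the innermost (vectorized) extent
    // is a warp multiple and no more than three extents exceed 1.
    bool plausible_thread_tiling(const std::vector<int64_t> &lowered) {
        if (lowered.empty() || lowered[0] % 32 != 0) {
            return false;
        }
        return lowered.size() <= 3;
    }

For example, extents {1, 64, 4} with the vectorized loop at index 1 lower to {64, 4} and pass the check, while {48, 4} is rejected because 48 is not a multiple of 32.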
diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp index 8a13dc176b37..6dfeb0dc62b5 100644 --- a/src/autoschedulers/anderson2021/cost_model_generator.cpp +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -43,7 +43,8 @@ struct ModelWeight : public GeneratorInput> { GeneratorOutput> grad; ModelWeight(const std::string &name, int dim) - : GeneratorInput>(name, dim), grad("updated_" + name, dim + 1) { + : GeneratorInput>(name, dim), + grad("updated_" + name, dim + 1) { } void backprop(const Derivative &d, Expr learning_rate, const Expr ×tep) { std::vector args(dimensions() + 1); diff --git a/src/autoschedulers/anderson2021/retrain_cost_model.cpp b/src/autoschedulers/anderson2021/retrain_cost_model.cpp index 89ef78bebff0..bb0b6ece7245 100644 --- a/src/autoschedulers/anderson2021/retrain_cost_model.cpp +++ b/src/autoschedulers/anderson2021/retrain_cost_model.cpp @@ -266,10 +266,9 @@ size_t load_samples(map &training_set, map worst_inversion.badness) { worst_inversion.pipeline_id = p.pipeline_id;