Skip to content

Commit

Permalink
Parallelize gate argument outside the expression tree. (#19)
Browse files Browse the repository at this point in the history
* Parallelize gate argument outside the expression tree.

* Optimize memory consumption a bit.

---------

Co-authored-by: Martun Karapetyan <martun@nil.foundation>
  • Loading branch information
martun and Martun Karapetyan authored Jul 30, 2024
1 parent 404077b commit bcaebf3
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 110 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -77,18 +77,18 @@ namespace nil {
constexpr static const std::size_t argument_size = 1;

static inline void build_variable_value_map(
const math::expression<polynomial_dfs_variable_type>& expr,
const math::expression<variable_type>& expr,
const plonk_polynomial_dfs_table<FieldType> &assignments,
std::shared_ptr<math::evaluation_domain<FieldType>> domain,
std::size_t extended_domain_size,
std::unordered_map<polynomial_dfs_variable_type, polynomial_dfs_type>& variable_values_out) {
std::unordered_map<variable_type, polynomial_dfs_type>& variable_values_out) {

std::unordered_map<polynomial_dfs_variable_type, size_t> variable_counts;
std::unordered_map<variable_type, size_t> variable_counts;

std::vector<polynomial_dfs_variable_type> variables;
std::vector<variable_type> variables;

math::expression_for_each_variable_visitor<polynomial_dfs_variable_type> visitor(
[&variable_counts, &variables, &variable_values_out](const polynomial_dfs_variable_type& var) {
math::expression_for_each_variable_visitor<variable_type> visitor(
[&variable_counts, &variables, &variable_values_out](const variable_type& var) {
// Create the structure of the map so we can change the values later.
if (variable_counts[var] == 0) {
variables.push_back(var);
Expand All @@ -107,12 +107,14 @@ namespace nil {

parallel_for(0, variables.size(),
[&variables, &variable_values_out, &assignments, &domain, &extended_domain, extended_domain_size](std::size_t i) {
const auto& var = variables[i];
// We may have variable values in required sizes in some cases.
if (variable_values_out[var].size() == extended_domain_size)
return;
const variable_type& var = variables[i];

polynomial_dfs_type assignment = assignments.get_variable_value(var, domain);
// Convert the variable to polynomial_dfs variable type.
polynomial_dfs_variable_type var_dfs(var.index, var.rotation, var.relative,
static_cast<typename polynomial_dfs_variable_type::column_type>(
static_cast<std::uint8_t>(var.type)));

polynomial_dfs_type assignment = assignments.get_variable_value(var_dfs, domain);

// In parallel version we always resize the assignment poly, it's better for parallelization.
// if (count > 1) {
Expand Down Expand Up @@ -152,11 +154,7 @@ namespace nil {
degree_limits.push_back(max_degree / 2);
extended_domain_sizes.push_back(max_domain_size / 2);

std::vector<math::expression<polynomial_dfs_variable_type>> expressions(extended_domain_sizes.size());

// Only in parallel version we store the subexpressions of each expression and ignore the cache.
std::vector<std::vector<math::expression<polynomial_dfs_variable_type>>> subexpressions(extended_domain_sizes.size());

std::vector<math::expression<variable_type>> expressions(extended_domain_sizes.size());
auto theta_acc = FieldType::value_type::one();

// Every constraint has variable type 'variable_type', but we want it to use
Expand All @@ -170,28 +168,10 @@ namespace nil {
const auto& gates = constraint_system.gates();

for (const auto& gate: gates) {
std::vector<math::expression<polynomial_dfs_variable_type>> gate_results(extended_domain_sizes.size());

// We will split gates into parts, especially for the zkEVM circuit, since there is only 1 large gate with
// 683 constraints. Will split it into 24 parts, ~32 constraints each.
// This will mean our code will multiply by selector 16 times, instead of just once. But this is
// much better than losing parallelization. We do not want to re-write the whole code to try to parallelize
// each gate computation separately. This will not harm circuits with smaller number of terms much.
std::vector<math::expression<polynomial_dfs_variable_type>> gate_parts(extended_domain_sizes.size());
std::vector<std::size_t> gate_parts_constaint_counts(extended_domain_sizes.size());


// This parameter can be tuned based on the circuit and the number of cores of the server on which the proofs
// are generated. On the current zkEVM circuit this value is optimal based on experiments.
const std::size_t constraint_limit = 16;


auto selector = polynomial_dfs_variable_type(
gate.selector_index, 0, false, polynomial_dfs_variable_type::column_type::selector);

std::vector<math::expression<variable_type>> gate_results(extended_domain_sizes.size());
for (std::size_t constraint_idx = 0; constraint_idx < gate.constraints.size(); ++constraint_idx) {
const auto& constraint = gate.constraints[constraint_idx];
auto next_term = converter.convert(constraint) * value_type_to_polynomial_dfs(theta_acc);
auto next_term = constraint * theta_acc;

theta_acc *= theta;
// +1 stands for the selector multiplication.
Expand All @@ -200,57 +180,46 @@ namespace nil {
// Whatever the degree of term is, add it to the maximal degree expression.
if (degree_limits[i] >= constraint_degree || i == 0) {
gate_results[i] += next_term;
gate_parts[i] += next_term;
gate_parts_constaint_counts[i]++;

// If we already have constraint_limit constraints in gate_parts[i], add it to the 'subexpressions'.
if (gate_parts_constaint_counts[i] == constraint_limit) {
subexpressions[i].push_back(gate_parts[i] * selector);
gate_parts[i] = math::expression<polynomial_dfs_variable_type>();
gate_parts_constaint_counts[i] = 0;
}
break;
}

}
}

auto selector = variable_type(
gate.selector_index, 0, false, variable_type::column_type::selector);
for (size_t i = 0; i < extended_domain_sizes.size(); ++i) {
// Only in parallel version we store the subexpressions of each expression and ignore the cache.
expressions[i] += gate_results[i] * selector;
if (gate_parts_constaint_counts[i] != 0)
subexpressions[i].push_back(gate_parts[i] * selector);
}
}

std::array<polynomial_dfs_type, argument_size> F;

std::vector<polynomial_dfs_type> F_0_parts(extended_domain_sizes.size());
parallel_for(0, extended_domain_sizes.size(),
[&subexpressions, &extended_domain_sizes, &F_0_parts, &original_domain, &column_polynomials, &expressions](std::size_t i) {
std::unordered_map<polynomial_dfs_variable_type, polynomial_dfs_type> variable_values;
F[0] = polynomial_dfs_type::zero();
for (std::size_t i = 0; i < extended_domain_sizes.size(); ++i) {
std::unordered_map<variable_type, polynomial_dfs_type> variable_values;

build_variable_value_map(expressions[i], column_polynomials, original_domain,
extended_domain_sizes[i], variable_values);

std::vector<polynomial_dfs_type> subvalues(subexpressions[i].size());
parallel_for(0, subexpressions[i].size(),
[&subexpressions, &variable_values, &extended_domain_sizes, &subvalues, i](std::size_t subexpression_index) {
// Only in parallel version we store the subexpressions of each expression and ignore the cache,
// not using "cached_expression_evaluator".
math::expression_evaluator<polynomial_dfs_variable_type> evaluator(
subexpressions[i][subexpression_index],
[&assignments=variable_values, domain_size=extended_domain_sizes[i]]
(const polynomial_dfs_variable_type &var) -> const polynomial_dfs_type& {
return assignments[var];
});
subvalues[subexpression_index] = evaluator.evaluate();
}, ThreadPool::PoolLevel::HIGH);
polynomial_dfs_type result(extended_domain_sizes[i] - 1, extended_domain_sizes[i]);
wait_for_all(parallel_run_in_chunks<void>(
extended_domain_sizes[i],
[&variable_values, &extended_domain_sizes, &result, &expressions, i]
(std::size_t begin, std::size_t end) {
for (std::size_t j = begin; j < end; ++j) {
// Don't use cache here. In practice it's slower to maintain the cache
// than to re-compute the subexpression value when value type is field element.
math::expression_evaluator<variable_type> evaluator(
expressions[i],
[&assignments=variable_values, j]
(const variable_type &var) -> const typename FieldType::value_type& {
return assignments[var][j];
});
result[j] = evaluator.evaluate();
}
}, ThreadPool::PoolLevel::HIGH));

F_0_parts[i] = polynomial_sum<FieldType>(std::move(subvalues));
}, ThreadPool::PoolLevel::LASTPOOL);

F[0] += polynomial_sum<FieldType>(std::move(F_0_parts));
F[0] += result;
};
F[0] *= mask_polynomial;
return F;
}
Expand Down
Loading

0 comments on commit bcaebf3

Please sign in to comment.