Skip to content

Commit

Permalink
Chore: folding in parallel with witness generation for NIVC (#1011)
Browse files Browse the repository at this point in the history
* MultiFrames with PC=0 have their witnesses cached just like in the IVC pipeline
* MultiFrames with PC!=0 have their witnesses cached in parallel due to limited size
  (agnostic to RC) and poor internal parallelism
  • Loading branch information
arthurpaulino authored Jan 10, 2024
1 parent b278e9d commit 73badc6
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 9 deletions.
4 changes: 4 additions & 0 deletions src/lem/multiframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,10 @@ impl<'a, F: LurkField, C: Coprocessor<F>> MultiFrame<'a, F, C> {
.skip_while(|f| f.input == f.output && stop_cond(&f.output))
.count()
}

/// Returns this `MultiFrame`'s program counter (the `pc` field).
///
/// A value of `0` identifies the main Lurk step circuit; the NIVC proving
/// pipeline uses this to decide how witness generation is scheduled
/// (PC = 0 frames are cached sequentially, PC != 0 frames in parallel).
pub fn program_counter(&self) -> usize {
self.pc
}
}

impl CEKState<Ptr> for Vec<Ptr> {
Expand Down
2 changes: 0 additions & 2 deletions src/proof/nova.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#![allow(non_snake_case)]

use abomonation::Abomonation;
use bellpepper_core::{num::AllocatedNum, ConstraintSystem};
use ff::PrimeField;
Expand Down
75 changes: 68 additions & 7 deletions src/proof/supernova.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#![allow(non_snake_case)]

use abomonation::Abomonation;
use ff::PrimeField;
use nova::{
Expand All @@ -15,11 +13,17 @@ use nova::{
Engine,
},
};
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
use serde::{Deserialize, Serialize};
use std::{marker::PhantomData, ops::Index, sync::Arc};
use std::{
marker::PhantomData,
ops::Index,
sync::{Arc, Mutex},
};
use tracing::info;

use crate::{
config::lurk_config,
coprocessor::Coprocessor,
error::ProofError,
eval::lang::Lang,
Expand Down Expand Up @@ -176,7 +180,7 @@ where
pp: &PublicParams<F, C1LEM<'a, F, C>>,
z0: &[F],
steps: Vec<C1LEM<'a, F, C>>,
_store: &'a Store<F>,
store: &'a Store<F>,
_reduction_count: usize,
_lang: Arc<Lang<F, C>>,
) -> Result<Self, ProofError> {
Expand All @@ -185,16 +189,18 @@ where
let z0_primary = z0;
let z0_secondary = Self::z0_secondary();

for (i, step) in steps.iter().enumerate() {
let mut prove_step = |i: usize, step: &C1LEM<'a, F, C>| {
info!("prove_recursively, step {i}");

let secondary_circuit = step.secondary_circuit();

let mut recursive_snark = recursive_snark_option.clone().unwrap_or_else(|| {
info!("RecursiveSnark::new {i}");
RecursiveSNARK::new(
&pp.pp,
step,
step,
&step.secondary_circuit(),
&secondary_circuit,
z0_primary,
&z0_secondary,
)
Expand All @@ -204,10 +210,65 @@ where
info!("prove_step {i}");

recursive_snark
.prove_step(&pp.pp, step, &step.secondary_circuit())
.prove_step(&pp.pp, step, &secondary_circuit)
.unwrap();

recursive_snark_option = Some(recursive_snark);
};

if lurk_config(None, None)
.perf
.parallelism
.recursive_steps
.is_parallel()
{
let cc = steps
.into_iter()
.map(|mf| (mf.program_counter() == 0, Mutex::new(mf)))
.collect::<Vec<_>>();

crossbeam::thread::scope(|s| {
s.spawn(|_| {
// Skip the very first circuit's witness, so `prove_step` can begin immediately.
// That circuit's witness will not be cached and will just be computed on-demand.

// There are many MultiFrames with PC = 0, each with several inner frames and heavy internal
// parallelism for witness generation. So we do it like in Nova's pipeline.
cc.iter()
.skip(1)
.filter(|(is_zero_pc, _)| *is_zero_pc)
.for_each(|(_, mf)| {
mf.lock()
.unwrap()
.cache_witness(store)
.expect("witness caching failed");
});

// There shouldn't be as many MultiFrames with PC != 0 and they only have one inner frame, each with
// poor internal parallelism for witness generation, so we can generate their witnesses in parallel.
// This is mimicking the behavior we had in the Nova pipeline before #941 so...
// TODO: once we have robust benchmarking for NIVC, we should test whether merging this loop with
// the non-parallel one above (and getting rid of the filters) is better
cc.par_iter()
.skip(1)
.filter(|(is_zero_pc, _)| !*is_zero_pc)
.for_each(|(_, mf)| {
mf.lock()
.unwrap()
.cache_witness(store)
.expect("witness caching failed");
});
});

for (i, (_, step)) in cc.iter().enumerate() {
prove_step(i, &step.lock().unwrap());
}
})
.unwrap()
} else {
for (i, step) in steps.iter().enumerate() {
prove_step(i, step);
}
}

// This probably should be made unnecessary.
Expand Down

1 comment on commit 73badc6

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmarks

Table of Contents

Overview

This benchmark report shows the Fibonacci GPU benchmark.
NVIDIA L4
Intel(R) Xeon(R) CPU @ 2.20GHz
125.78 GB RAM
Workflow run: https://github.com/lurk-lab/lurk-rs/actions/runs/7468383290

Benchmark Results

LEM Fibonacci Prove - rc = 100

fib-ref=b278e9d43b10ba32d04e8e963c45db32b2d8d4a8 fib-ref=73badc6d513fbe3b2718bc1e18cf166de0dad127
num-100 1.73 s (✅ 1.00x) 1.73 s (✅ 1.00x faster)
num-200 3.33 s (✅ 1.00x) 3.32 s (✅ 1.00x faster)

LEM Fibonacci Prove - rc = 600

fib-ref=b278e9d43b10ba32d04e8e963c45db32b2d8d4a8 fib-ref=73badc6d513fbe3b2718bc1e18cf166de0dad127
num-100 1.96 s (✅ 1.00x) 1.95 s (✅ 1.01x faster)
num-200 3.35 s (✅ 1.00x) 3.34 s (✅ 1.00x faster)

Made with criterion-table

Please sign in to comment.