Skip to content

Commit

Permalink
add example to measure throughput (raw and adjusted) across different…
Browse files Browse the repository at this point in the history
… rcs and number of folding steps (#908)
  • Loading branch information
arthurpaulino authored Dec 21, 2023
1 parent 6af8086 commit 510d704
Show file tree
Hide file tree
Showing 2 changed files with 231 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ criterion = "0.5"
expect-test = "1.4.1"
hex = "0.4.3"
pprof = { version = "0.13" }
statrs = "0.16.0"
structopt = { version = "0.3", default-features = false }
tap = "1.0.1"
tempfile = { workspace = true }
Expand Down
230 changes: 230 additions & 0 deletions examples/tp_table.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
use anyhow::{anyhow, Result};
use ascii_table::{Align, AsciiTable};
use criterion::black_box;
use lurk::{
eval::lang::{Coproc, Lang},
lem::{eval::evaluate, multiframe::MultiFrame, store::Store},
proof::{
nova::{public_params, NovaProver, PublicParams},
Prover,
},
};
use num_traits::ToPrimitive;
use pasta_curves::pallas::Scalar as Fr;
use statrs::statistics::Statistics;
use std::{cell::OnceCell, sync::Arc, time::Instant};

const PROGRAM: &str = "(letrec ((loop (lambda (k) (begin
;; fibonacci from lurk-lib
(letrec ((next (lambda (a b n target)
(if (eq n target)
a
(next b
(+ a b)
(+ 1 n)
target))))
(fib (next 0 1 0)))
(fib k))
;; to use commitment slots
(commit k)
;; < uses three bit decomp slots
(< 0 k)
;; u64 operator uses one bit decomp slot
(u64 5)
;; product between u64s uses one bit decomp slot
(* 2u64 3u64)
;; char operator uses one bit decomp slot
(char 97)
;; loop infinitely
(loop (+ k 1))))))
(loop 0))";

#[inline]
fn n_iters(n_folds: usize, rc: usize) -> usize {
(n_folds + 1) * rc
}

fn filter(elts: &[f64]) -> Vec<f64> {
let mean = elts.mean();
let sd = elts.std_dev();
let lower = &(mean - sd);
let upper = &(mean + sd);
let mut out = Vec::with_capacity(elts.len());
for e in elts.iter() {
if lower <= e && e <= upper {
out.push(*e)
}
}
out
}

fn analyze_raw(data: &[Vec<(usize, Vec<f64>)>], rc_vec: &[usize]) -> Vec<Vec<String>> {
let mut printed = Vec::with_capacity(data.len());
for (line_data, rc) in data.iter().zip(rc_vec) {
let mut line = Vec::with_capacity(line_data.len() + 1);
line.push(rc.to_string());
for (n_frames, durations) in line_data {
let tps = durations
.iter()
.map(|d| n_frames.to_f64().unwrap() / d)
.collect::<Vec<_>>();
line.push(format!("{:.2}±{:.2}", (&tps).mean(), tps.std_dev()));
}
printed.push(line);
}
printed
}

fn analyze_adj(data: &[Vec<(usize, Vec<f64>)>], rc_vec: &[usize]) -> Vec<Vec<String>> {
let mut printed = Vec::with_capacity(data.len());
for (line_data, rc) in data.iter().zip(rc_vec) {
let mut line = Vec::with_capacity(line_data.len() + 1);
line.push(rc.to_string());
let n_frames_duration_0 = OnceCell::new();
for (n_frames, durations) in &line_data[1..] {
let (n_frames_0, duration_0) = n_frames_duration_0.get_or_init(|| {
let (n_frames_0, durations_0) = &line_data[0];
let duration_0 = durations_0.mean();
(*n_frames_0, duration_0)
});
let tps = durations
.iter()
.map(|d| (n_frames - n_frames_0).to_f64().unwrap() / (d - duration_0))
.collect::<Vec<_>>();
line.push(format!("{:.2}±{:.2}", (&tps).mean(), tps.std_dev()));
}
printed.push(line);
}
printed
}

fn rc_env() -> Result<Vec<usize>> {
std::env::var("LURK_RC")
.map_err(|e| anyhow!("LURK_RC env var isn't set: {e}"))
.and_then(|rc| {
let vec: Result<Vec<usize>> = rc
.split(',')
.map(|rc| {
rc.parse::<usize>()
.map_err(|e| anyhow!("Failed to parse RC: {e}"))
})
.collect();
vec
})
}

fn n_folds_env() -> Result<usize> {
std::env::var("LURK_N_FOLDS")
.map_err(|e| anyhow!("LURK_N_FOLDS env var isn't set: {e}"))
.and_then(|n_folds| {
n_folds
.parse::<usize>()
.map_err(|e| anyhow!("Failed to parse N_FOLDS: {e}"))
})
}

/// This example prints adjusted and raw throughput tables whose dimensions are
/// the reduction count (vertical) and the number of folding steps (horizontal).
///
/// Example:
/// ```text
/// $ LURK_RC=8,16 LURK_N_FOLDS=2 cargo run --release --example tp_table
/// ...
/// Raw throughput
/// ┌────────────┬─────────────┬────────────┬────────────┐
/// │ rc\n_folds │ 0 │ 1 │ 2 │
/// ├────────────┼─────────────┼────────────┼────────────┤
/// │ 8 │ 85.61±3.15 │ 64.45±0.92 │ 56.22±0.68 │
/// │ 16 │ 111.48±2.30 │ 77.47±0.70 │ 67.54±0.61 │
/// └────────────┴─────────────┴────────────┴────────────┘
/// Adjusted throughput
/// ┌────────────┬────────────┬────────────┐
/// │ rc\n_folds │ 1 │ 2 │
/// ├────────────┼────────────┼────────────┤
/// │ 8 │ 51.72±1.19 │ 48.00±0.75 │
/// │ 16 │ 59.38±0.83 │ 56.42±0.63 │
/// └────────────┴────────────┴────────────┘
/// ```
/// The first table
fn main() {
let rc_vec = rc_env().unwrap_or_else(|_| vec![100]);
let max_n_folds = n_folds_env().unwrap_or(3);
let n_samples = 10;

let max_rc = rc_vec.iter().max().unwrap();

let limit = n_iters(max_n_folds, *max_rc);

let store = Store::<Fr>::default();
let program = store.read_with_default_state(PROGRAM).unwrap();

let frames = evaluate::<Fr, Coproc<Fr>>(None, program, &store, limit).unwrap();

let lang = Lang::<Fr, Coproc<Fr>>::new();
let lang_arc = Arc::new(lang.clone());

let mut data = Vec::with_capacity(rc_vec.len());

for rc in rc_vec.clone() {
let prover: NovaProver<'_, _, _, MultiFrame<'_, _, Coproc<Fr>>> =
NovaProver::new(rc, lang_arc.clone());
println!("Getting public params for rc={rc}");
// TODO: use cache once it's fixed
let pp: PublicParams<_, MultiFrame<'_, _, Coproc<Fr>>> =
public_params(rc, lang_arc.clone());
let n_folds_data = (0..=max_n_folds)
.map(|n_folds| {
let n_frames = n_iters(n_folds, rc);
let frames = &frames[..n_frames];
println!(" Proving for rc={rc} and n_folds={n_folds} ({n_frames} frames)");
let mut timings = Vec::with_capacity(n_samples);
for _ in 0..n_samples {
let start = Instant::now();
let result = prover.prove(&pp, frames, &store);
let _ = black_box(result);
let end = start.elapsed().as_secs_f64();
timings.push(end);
}
(n_frames, timings)
})
.collect::<Vec<_>>();
data.push(
n_folds_data
.into_iter()
.map(|(x, y)| (x, filter(&y)))
.collect(),
);
}

let mut table = AsciiTable::default();
table
.column(0)
.set_header("rc\\n_folds")
.set_align(Align::Right);
(0..=max_n_folds).for_each(|i| {
table
.column(i + 1)
.set_header(i.to_string())
.set_align(Align::Right);
});

println!("Raw throughput");
table.print(analyze_raw(&data, &rc_vec));

if max_n_folds > 0 {
let mut table = AsciiTable::default();
table
.column(0)
.set_header("rc\\n_folds")
.set_align(Align::Right);
(1..=max_n_folds).for_each(|i| {
table
.column(i)
.set_header(i.to_string())
.set_align(Align::Right);
});

println!("Adjusted throughput");
table.print(analyze_adj(&data, &rc_vec));
}
}

1 comment on commit 510d704

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmarks

Table of Contents

Overview

This benchmark report shows the Fibonacci GPU benchmark.
NVIDIA L4
Intel(R) Xeon(R) CPU @ 2.20GHz
125.78 GB RAM
Workflow run: https://github.com/lurk-lab/lurk-rs/actions/runs/7294053212

Benchmark Results

LEM Fibonacci Prove - rc = 100

fib-ref=6af808640caa7a23bfaae652b1af33470f5e7cb9 fib-ref=510d7042990844760d97d65c7e6c7ab75f934630
num-100 2.36 s (✅ 1.00x) 2.37 s (✅ 1.00x slower)
num-200 4.61 s (✅ 1.00x) 4.63 s (✅ 1.00x slower)

LEM Fibonacci Prove - rc = 600

fib-ref=6af808640caa7a23bfaae652b1af33470f5e7cb9 fib-ref=510d7042990844760d97d65c7e6c7ab75f934630
num-100 1.99 s (✅ 1.00x) 1.99 s (✅ 1.00x slower)
num-200 4.49 s (✅ 1.00x) 4.50 s (✅ 1.00x slower)

Made with criterion-table

Please sign in to comment.