Skip to content

Commit

Permalink
bugfix, comment fix, force compile fails for big-endian
Browse files Browse the repository at this point in the history
  • Loading branch information
a10y committed Aug 15, 2024
1 parent 56e0ace commit ca0c7d8
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 83 deletions.
36 changes: 0 additions & 36 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ use_debug = { level = "deny" }

[dev-dependencies]
criterion = "0.5"
lz4 = "1"

[[example]]
name = "round_trip"
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ but it is mostly written from a careful reading of the paper.

**NOTE: This implementation is still in progress and is not production-ready; please use it at your own risk.**

**NOTE: This crate only works on little-endian architectures. There are currently no plans to support big-endian targets.**

[whitepaper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
[MIT-licensed implementation]: https://github.com/cwida/fsst
40 changes: 1 addition & 39 deletions benches/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@
//! Also contains LZ4 baseline.
#![allow(missing_docs)]
use core::str;
use std::io::{Cursor, Read, Write};

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use lz4::liblz4::BlockChecksum;
use lz4::{BlockSize, ContentChecksum};

use fsst_rs::{train, ESCAPE_CODE};

Expand Down Expand Up @@ -48,40 +45,5 @@ fn bench_fsst(c: &mut Criterion) {
});
}

/// Registers the "lz4" Criterion benchmark group: one compression and one
/// decompression benchmark over `TEST`, using the `lz4` crate's encoder/decoder.
fn bench_lz4(c: &mut Criterion) {
let mut group = c.benchmark_group("lz4");

group.bench_function("compress-single", |b| {
// Output buffer, pre-sized to 100_000_000 bytes; the encoder writes into it
// through the mutable borrow passed to `build`.
let mut compressed = Vec::with_capacity(100_000_000);
// Block size `Max64KB`, with both the content checksum and the per-block
// checksum switched off.
let mut encoder = lz4::EncoderBuilder::new()
.block_size(BlockSize::Max64KB)
.checksum(ContentChecksum::NoChecksum)
.block_checksum(BlockChecksum::NoBlockChecksum)
.build(&mut compressed)
.unwrap();

// The encoder is created once, outside the timed closure, so every iteration
// writes `TEST` into the same encoder and the same output buffer.
// NOTE(review): `finish()` is never called on this encoder — presumably some
// data can remain buffered and unmeasured; confirm against the lz4 crate.
b.iter(|| encoder.write_all(TEST.as_bytes()).unwrap());
});

group.bench_function("decompress-single", |b| {
// Setup (not timed): compress `TEST` once into an owned Vec with the same
// encoder settings as above, then take the buffer back via `finish()`.
let compressed = Vec::new();
let mut encoder = lz4::EncoderBuilder::new()
.block_size(BlockSize::Max64KB)
.checksum(ContentChecksum::NoChecksum)
.block_checksum(BlockChecksum::NoBlockChecksum)
.build(compressed)
.unwrap();
encoder.write_all(TEST.as_bytes()).unwrap();
// `finish()` hands back the writer together with the result of finishing
// the stream; the result is unwrapped so a failure aborts the benchmark.
let (compressed, result) = encoder.finish();
result.unwrap();

// The decoder and the output Vec are both created once, outside the timed closure.
let cursor = Cursor::new(compressed);
let mut decoder = lz4::Decoder::new(cursor).unwrap();
let mut output = Vec::new();

// NOTE(review): every iteration calls `read_to_end` on the same decoder —
// presumably only the first iteration decompresses anything and later ones
// hit end-of-stream immediately; verify before trusting these timings.
b.iter(|| decoder.read_to_end(&mut output).unwrap());
});
}

criterion_group!(compress_bench, bench_fsst, bench_lz4);
criterion_group!(compress_bench, bench_fsst);
criterion_main!(compress_bench);
2 changes: 1 addition & 1 deletion src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ impl SymbolTable {
let symbol2 = &self.symbols[code2 as usize];
// If either symbol is zero-length, or if merging would yield a symbol of
// length greater than 8, skip.
if symbol1.len() + symbol2.len() >= 8 || symbol1.is_empty() || symbol2.is_empty() {
if symbol1.len() + symbol2.len() >= 8 {
continue;
}
let new_symbol = symbol1.concat(symbol2);
Expand Down
15 changes: 9 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#![doc = include_str!("../README.md")]
#![cfg(target_endian = "little")]

/// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes.
macro_rules! assert_sizeof {
Expand Down Expand Up @@ -49,10 +50,12 @@ impl Symbol {
}

impl Symbol {
/// Calculate the length of the symbol in bytes.
/// Calculate the length of the symbol in bytes. Always a value between 1 and 8.
///
/// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols
/// can contain fewer bytes, padded with 0x00.
/// can contain fewer bytes, padded with 0x00. There is a special case of a symbol
/// that holds the byte 0x00. In that case, the symbol contains `0x0000000000000000`
/// but we want to interpret that as a one-byte symbol containing `0x00`.
pub fn len(&self) -> usize {
let numeric = unsafe { self.num };
// For little-endian platforms, this counts the number of *trailing* zeros
Expand Down Expand Up @@ -113,10 +116,10 @@ impl Symbol {

/// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`.
pub fn concat(&self, other: &Self) -> Self {
let new_len = self.len() + other.len();
let self_len = self.len();
let new_len = self_len + other.len();
assert!(new_len <= 8, "cannot build symbol with length > 8");

let self_len = self.len();
let mut result = *self;

// SAFETY: self_len and new_len are checked to be <= 8
Expand Down Expand Up @@ -421,13 +424,13 @@ impl SymbolTable {

/// Decompress a byte slice that was previously returned by [compression][Self::compress].
pub fn decompress(&self, compressed: &[u8]) -> Vec<u8> {
let mut decoded: Vec<u8> = Vec::with_capacity(size_of::<Symbol>() * compressed.len());
let mut decoded: Vec<u8> = Vec::with_capacity(size_of::<Symbol>() * (compressed.len() + 1));
let ptr = decoded.as_mut_ptr();

let mut in_pos = 0;
let mut out_pos = 0;

while in_pos < compressed.len() && out_pos < (decoded.capacity() + size_of::<Symbol>()) {
while in_pos < compressed.len() && out_pos < (decoded.capacity() - size_of::<Symbol>()) {
let code = compressed[in_pos];
if code == ESCAPE_CODE {
// Advance by one, do raw write.
Expand Down
14 changes: 14 additions & 0 deletions tests/correctness.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#![cfg(test)]

use fsst_rs::Symbol;

static PREAMBLE: &str = r#"
When in the Course of human events, it becomes necessary for one people to dissolve
the political bands which have connected them with another, and to assume among the
Expand Down Expand Up @@ -29,6 +31,18 @@ fn test_train_on_empty() {
);
}

/// Round-trips a single byte through a table that holds exactly one
/// one-byte symbol.
#[test]
fn test_one_byte() {
    // Register the byte 0x01 as the only symbol in an otherwise default table;
    // the assertion below expects it to be encoded as code 0.
    let mut table = fsst_rs::SymbolTable::default();
    table.insert(Symbol::from_u8(0x01));

    // Compressing the lone byte must yield just that symbol's code.
    let input = [0x01u8];
    let encoded = table.compress(&input);
    assert_eq!(encoded, vec![0u8]);

    // Decompressing the code must give back the original byte.
    let decoded = table.decompress(&encoded);
    assert_eq!(decoded, vec![0x01]);
}

#[test]
fn test_zeros() {
println!("training zeros");
Expand Down

0 comments on commit ca0c7d8

Please sign in to comment.