Skip to content

Commit

Permalink
feat: improved gitoid_from_buffer, misc. fixups
Browse files Browse the repository at this point in the history
This commit does a few things:

- Refactors gitoid_from_buffer to delegate to BufReader.
- Uses format_bytes::format_bytes instead of format to construct
  prefix for hashing.
- Adds GitOid::from_reader_with_expected_length to support
  non-Seek-able readers and avoid cost of seeking if length is
  already known.
- Adds GITOID_URL_SCHEME constant to replace two uses of
  literal string and ensure they're always consistent.
- Updates FFI to support new from_reader_with_expected_length
- Updates FFI to avoid double-buffering when building a GitOid
  from a reader, since the underlying implementation buffers.
- Adds FFI support for new from_reader_with_expected_length
  constructor.
- Renames FFI constructor from _new_from_* to _from_*, and
  updates C test code to use the new prefix.
  • Loading branch information
alilleybrinker committed Feb 13, 2024
1 parent da66cf6 commit 0b8ea1d
Show file tree
Hide file tree
Showing 6 changed files with 193 additions and 91 deletions.
1 change: 1 addition & 0 deletions gitoid/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ crate-type = ["rlib", "cdylib"]
[dependencies]
# Match the version used in sha1 and sha2.
digest = "0.10.7"
format-bytes = "0.3.0"
# Match the version used in sha1, sha2, and digest.
generic-array = "0.14.7"
hex = { version = "0.4.3", default-features = false, features = ["std"] }
Expand Down
69 changes: 55 additions & 14 deletions gitoid/src/ffi/gitoid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ use sha1collisiondetection::Sha1CD as Sha1Cd;
use sha2::Sha256;
use std::ffi::CString;
use std::fs::File;
use std::io::BufReader;
#[cfg(target_family = "unix")]
use std::os::unix::prelude::FromRawFd;
#[cfg(target_family = "unix")]
Expand Down Expand Up @@ -120,7 +119,7 @@ macro_rules! generate_gitoid_ffi_for_hash {
/// `content` must not be null, and the length of the buffer must match the
/// length in bytes passed by `content_len`.
#[no_mangle]
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _new_from_bytes>](
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _from_bytes>](
content: *mut u8,
content_len: usize,
) -> *const [<GitOid $hash_ty $object_ty>] {
Expand All @@ -143,7 +142,7 @@ macro_rules! generate_gitoid_ffi_for_hash {
/// end, all contained in a single contiguous allocation. The pointer must also
/// not be null.
#[no_mangle]
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _new_from_str>](
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _from_str>](
s: *const c_char,
) -> *const [<GitOid $hash_ty $object_ty>] {
let output = catch_panic(|| {
Expand All @@ -165,7 +164,7 @@ macro_rules! generate_gitoid_ffi_for_hash {
///
/// The returned `GitOid` must be freed.
#[no_mangle]
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _new_from_url>](
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _from_url>](
s: *const c_char
) -> *const [<GitOid $hash_ty $object_ty>] {
let output = catch_panic(|| {
Expand All @@ -189,20 +188,36 @@ macro_rules! generate_gitoid_ffi_for_hash {
/// Returns an invalid `GitOid` if construction fails.
#[cfg(target_family = "unix")]
#[no_mangle]
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _new_from_reader>](
fd: RawFd,
should_buffer: bool,
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _from_reader>](
fd: RawFd
) -> *const [<GitOid $hash_ty $object_ty>] {
let output = catch_panic(|| {
let file = unsafe { File::from_raw_fd(fd) };
let gitoid = GitOid::<$hash_ty, $object_ty>::from_reader(file)?;
let boxed = Box::new(gitoid);
Ok(Box::into_raw(boxed) as *const _)
});

let gitoid = if should_buffer {
let reader = BufReader::new(file);
GitOid::<$hash_ty, $object_ty>::from_reader(reader)?
} else {
GitOid::<$hash_ty, $object_ty>::from_reader(file)?
};
output.unwrap_or_else(null)
}

/// Create a new `GitOid` by reading data from a file.
///
/// # Safety
///
/// The provided file descriptor must be valid and open for reading.
///
/// Returns an invalid `GitOid` if construction fails.
#[cfg(target_family = "unix")]
#[no_mangle]
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _from_reader_with_expected_length>](
fd: RawFd,
expected_length: c_int,
) -> *const [<GitOid $hash_ty $object_ty>] {
let output = catch_panic(|| {
let file = unsafe { File::from_raw_fd(fd) };
let expected_length = expected_length as usize;
let gitoid = GitOid::<$hash_ty, $object_ty>::from_reader_with_expected_length(file, expected_length)?;
let boxed = Box::new(gitoid);
Ok(Box::into_raw(boxed) as *const _)
});
Expand All @@ -220,7 +235,7 @@ macro_rules! generate_gitoid_ffi_for_hash {
#[cfg(target_family = "windows")]
#[no_mangle]
/// cbindgen:ignore
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _new_from_reader>](
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _from_reader>](
handle: RawHandle,
) -> *const [<GitOid $hash_ty $object_ty>] {
let output = catch_panic(|| {
Expand All @@ -234,6 +249,32 @@ macro_rules! generate_gitoid_ffi_for_hash {
output.unwrap_or_else(null)
}

/// Create a new `GitOid` by reading data from a file.
///
/// # Safety
///
/// The provided file handle must be valid and open for reading.
///
/// Returns an invalid `GitOid` if construction fails.
#[cfg(target_family = "windows")]
#[no_mangle]
/// cbindgen:ignore
pub unsafe extern "C" fn [<gitoid_ $hash_name _ $object_name _from_reader_with_expected_length>](
handle: RawHandle,
expected_length: c_int,
) -> *const [<GitOid $hash_ty $object_ty>] {
let output = catch_panic(|| {
let file = unsafe { File::from_raw_handle(handle) };
let expected_length = expected_length as usize;
let reader = BufReader::new(file);
let gitoid = GitOid::<$hash_ty, $object_ty>::new_from_reader_with_expected_length(reader, expected_length)?;
let boxed = Box::new(gitoid);
Ok(Box::into_raw(boxed) as *const _)
});

output.unwrap_or_else(null)
}

/// Construct a URL representation of a `GitOid`.
///
/// # Safety
Expand Down
152 changes: 108 additions & 44 deletions gitoid/src/gitoid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,26 @@ use core::hash::Hash;
use core::marker::PhantomData;
use core::ops::Not as _;
use digest::OutputSizeUser;
use format_bytes::format_bytes;
use format_bytes::write_bytes;
use format_bytes::DisplayBytes;
use generic_array::sequence::GenericSequence;
use generic_array::ArrayLength;
use generic_array::GenericArray;
use std::cmp::Ordering;
use std::fmt::Debug;
use std::fmt::Display;
use std::fmt::Result as FmtResult;
use std::hash::Hasher;
use std::io::BufRead;
use std::io::BufReader;
use std::io::Read;
use std::io::Result as IoResult;
use std::io::Seek;
use std::io::SeekFrom;
use std::io::Write;
use std::str::FromStr;
use std::str::Split;
use std::hash::Hasher;
use std::io::BufReader;
use url::Url;

/// A struct that computes [gitoids][g] based on the selected algorithm
Expand All @@ -43,6 +49,8 @@ where
value: GenericArray<u8, H::OutputSize>,
}

const GITOID_URL_SCHEME: &str = "gitoid";

impl<H, O> GitOid<H, O>
where
H: HashAlgorithm,
Expand All @@ -55,6 +63,8 @@ where
//-------------------------------------------------------------------------------------------

/// Helper constructor for building a [`GitOid`] from a parsed hash.
///
/// Use this so you don't have to care about filling in the phantom data.
fn from_hash(value: GenericArray<u8, H::OutputSize>) -> GitOid<H, O> {
GitOid {
_phantom: PhantomData,
Expand All @@ -71,12 +81,8 @@ where
<H as OutputSizeUser>::OutputSize: ArrayLength<u8>,
GenericArray<u8, H::OutputSize>: Copy,
{
let digester = H::new();
let reader = BufReader::new(content);
let expected_length = content.len();

// PANIC SAFETY: We're reading from an in-memory buffer, so no IO errors can arise.
gitoid_from_buffer(digester, reader, expected_length).unwrap()
gitoid_from_buffer(H::new(), content, content.len()).unwrap()
}

inner(content.as_ref())
Expand All @@ -99,17 +105,22 @@ where
}

/// Create a `GitOid` from a reader.
pub fn from_reader<R: Read + Seek>(
mut reader: R
) -> Result<GitOid<H, O>> {
let digester = H::new();
pub fn from_reader<R: Read + Seek>(mut reader: R) -> Result<GitOid<H, O>> {
let expected_length = stream_len(&mut reader)? as usize;
gitoid_from_buffer(digester, reader, expected_length)
GitOid::from_reader_with_expected_length(reader, expected_length)
}

/// Generate a `GitOid` from a reader, providing an expected length in bytes.
pub fn from_reader_with_expected_length<R: Read>(
reader: R,
expected_length: usize,
) -> Result<GitOid<H, O>> {
gitoid_from_buffer(H::new(), reader, expected_length)
}

/// Construct a new `GitOid` from a `Url`.
pub fn from_url(url: Url) -> Result<GitOid<H, O>> {
url.try_into()
GitOid::try_from(url)
}

//===========================================================================================
Expand All @@ -118,9 +129,15 @@ where

/// Get a URL for the current `GitOid`.
pub fn url(&self) -> Url {
let s = format!("gitoid:{}:{}:{}", O::NAME, H::NAME, self.as_hex());
// PANIC SAFETY: We know that this is a valid URL.
Url::parse(&s).unwrap()
Url::parse(&format!(
"{}:{}:{}:{}",
GITOID_URL_SCHEME,
O::NAME,
H::NAME,
self.as_hex()
))
.unwrap()
}

/// Get the underlying bytes of the hash.
Expand Down Expand Up @@ -317,7 +334,7 @@ where
}

fn validate_url_scheme(&self) -> Result<()> {
if self.url.scheme() != "gitoid" {
if self.url.scheme() != GITOID_URL_SCHEME {
return Err(Error::InvalidScheme(self.url.clone()));
}

Expand Down Expand Up @@ -406,9 +423,9 @@ where
/// `expected_length`. If the actual bytes hashed differs, then something went
/// wrong and the hash is not valid.
fn gitoid_from_buffer<H, O, R>(
mut digester: H,
mut reader: R,
expected_length: usize,
digester: H,
reader: R,
expected_read_length: usize,
) -> Result<GitOid<H, O>>
where
H: HashAlgorithm,
Expand All @@ -417,45 +434,92 @@ where
GenericArray<u8, H::OutputSize>: Copy,
R: Read,
{
let prefix = format!("{} {}\0", O::NAME, expected_length);

// Linux default page size is 4096, so use that.
let mut buf = [0; 4096];
let mut amount_read: usize = 0;

digester.update(prefix.as_bytes());
let expected_hash_length = <H as OutputSizeUser>::output_size();
let (hash, amount_read) = hash_from_buffer::<H, O, R>(digester, reader, expected_read_length)?;

loop {
match reader.read(&mut buf)? {
0 => break,

size => {
digester.update(&buf[..size]);
amount_read += size;
}
}
}

if amount_read != expected_length {
if amount_read != expected_read_length {
return Err(Error::BadLength {
expected: expected_length,
expected: expected_read_length,
actual: amount_read,
});
}

let hash = digester.finalize();
let expected_size = <H as OutputSizeUser>::output_size();

if hash.len() != expected_size {
if hash.len() != expected_hash_length {
return Err(Error::UnexpectedHashLength {
expected: expected_size,
expected: expected_hash_length,
observed: hash.len(),
});
}

Ok(GitOid::from_hash(hash))
}

// Helper struct for using `format_bytes` during prefix construction for hashing.
struct Str<'s>(&'s str);

impl<'s> DisplayBytes for Str<'s> {
fn display_bytes(&self, output: &mut dyn Write) -> IoResult<()> {
write_bytes!(output, b"{}", self.0.as_bytes())?;
Ok(())
}
}

// Helper extension trait to give a convenient way to iterate over
// chunks sized to the size of the internal buffer of the reader.
trait ForEachChunk: BufRead {
// Takes a function to apply to each chunk, and returns if any
// errors arose along with the number of bytes read in total.
fn for_each_chunk(&mut self, f: impl FnMut(&[u8])) -> Result<usize>;
}

impl<R: BufRead> ForEachChunk for R {
fn for_each_chunk(&mut self, mut f: impl FnMut(&[u8])) -> Result<usize> {
let mut total_read = 0;

loop {
let buffer = self.fill_buf()?;
let amount_read = buffer.len();

if amount_read == 0 {
break;
}

f(buffer);

self.consume(amount_read);
total_read += amount_read;
}

Ok(total_read)
}
}

/// Helper function which actually applies the [`GitOid`] construction rules.
///
/// This function handles actually constructing the hash with the GitOID prefix,
/// and delegates to a buffered reader for performance of the chunked reading.
fn hash_from_buffer<H, O, R>(
mut digester: H,
reader: R,
expected_read_length: usize,
) -> Result<(GenericArray<u8, H::OutputSize>, usize)>
where
H: HashAlgorithm,
O: ObjectType,
<H as OutputSizeUser>::OutputSize: ArrayLength<u8>,
GenericArray<u8, H::OutputSize>: Copy,
R: Read,
{
digester.update(format_bytes!(
b"{} {}\0",
Str(O::NAME),
expected_read_length
));
let amount_read = BufReader::new(reader).for_each_chunk(|b| digester.update(b))?;
let hash = digester.finalize();
Ok((hash, amount_read))
}

// Adapted from the Rust standard library's unstable implementation
// of `Seek::stream_len`.
//
Expand Down
4 changes: 0 additions & 4 deletions gitoid/src/hash_algorithm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@ use sha1::Sha1;
use sha1collisiondetection::Sha1CD as Sha1Cd;
use sha2::Sha256;

mod private {
pub trait Sealed {}
}

/// Hash algorithms that can be used to make a [`GitOid`].
///
/// This is a sealed trait to ensure it's only used for hash
Expand Down
Loading

0 comments on commit 0b8ea1d

Please sign in to comment.