From 0b8ea1d00f98be4d22619586d9b811808b416cc9 Mon Sep 17 00:00:00 2001 From: Andrew Lilley Brinker Date: Tue, 13 Feb 2024 23:37:44 +0000 Subject: [PATCH] feat: improved gitoid_from_buffer, misc. fixups This commit does a few things: - Refactors gitoid_from_buffer to delegate to BufReader. - Uses format_bytes::format_bytes instead of format to construct prefix for hashing. - Adds GitOid::from_reader_with_expected_length to support non-Seek-able readers and avoid cost of seeking if length is already known. - Adds GITOID_URL_SCHEME constant to replace two uses of literal string and ensure they're always consistent. - Updates FFI to support new from_reader_with_expected_length - Updates FFI to avoid double-buffering when building a GitOid from a reader, since the underlying implementation buffers. - Adds FFI support for new from_reader_with_expected_length constructor. - Renames FFI constructor from _new_from_* to _from_*, and updates C test code to use the new prefix. --- gitoid/Cargo.toml | 1 + gitoid/src/ffi/gitoid.rs | 69 ++++++++++++---- gitoid/src/gitoid.rs | 152 +++++++++++++++++++++++++---------- gitoid/src/hash_algorithm.rs | 4 - gitoid/src/lib.rs | 32 ++++---- gitoid/test/c/test.c | 26 +++--- 6 files changed, 193 insertions(+), 91 deletions(-) diff --git a/gitoid/Cargo.toml b/gitoid/Cargo.toml index 95ac2c4..0599ff0 100644 --- a/gitoid/Cargo.toml +++ b/gitoid/Cargo.toml @@ -16,6 +16,7 @@ crate-type = ["rlib", "cdylib"] [dependencies] # Match the version used in sha1 and sha2. digest = "0.10.7" +format-bytes = "0.3.0" # Match the version used in sha1, sha2, and digest. generic-array = "0.14.7" hex = { version = "0.4.3", default-features = false, features = ["std"] } diff --git a/gitoid/src/ffi/gitoid.rs b/gitoid/src/ffi/gitoid.rs index 450da5c..37656b7 100644 --- a/gitoid/src/ffi/gitoid.rs +++ b/gitoid/src/ffi/gitoid.rs @@ -25,7 +25,6 @@ use sha1collisiondetection::Sha1CD as Sha1Cd; use sha2::Sha256; use std::ffi::CString; use std::fs::File; -use std::io::BufReader; #[cfg(target_family = "unix")] use std::os::unix::prelude::FromRawFd; #[cfg(target_family = "unix")] @@ -120,7 +119,7 @@ macro_rules! generate_gitoid_ffi_for_hash { /// `content` must not be null, and the length of the buffer must match the /// length in bytes passed by `content_len`. #[no_mangle] - pub unsafe extern "C" fn []( + pub unsafe extern "C" fn []( content: *mut u8, content_len: usize, ) -> *const [] { @@ -143,7 +142,7 @@ macro_rules! generate_gitoid_ffi_for_hash { /// end, all contained in a single contiguous allocation. The pointer must also /// not be null. #[no_mangle] - pub unsafe extern "C" fn []( + pub unsafe extern "C" fn []( s: *const c_char, ) -> *const [] { let output = catch_panic(|| { @@ -165,7 +164,7 @@ macro_rules! generate_gitoid_ffi_for_hash { /// /// The returned `GitOid` must be freed. #[no_mangle] - pub unsafe extern "C" fn []( + pub unsafe extern "C" fn []( s: *const c_char ) -> *const [] { let output = catch_panic(|| { @@ -189,20 +188,36 @@ macro_rules! generate_gitoid_ffi_for_hash { /// Returns an invalid `GitOid` if construction fails. #[cfg(target_family = "unix")] #[no_mangle] - pub unsafe extern "C" fn []( - fd: RawFd, - should_buffer: bool, + pub unsafe extern "C" fn []( + fd: RawFd ) -> *const [] { let output = catch_panic(|| { let file = unsafe { File::from_raw_fd(fd) }; + let gitoid = GitOid::<$hash_ty, $object_ty>::from_reader(file)?; + let boxed = Box::new(gitoid); + Ok(Box::into_raw(boxed) as *const _) + }); - let gitoid = if should_buffer { - let reader = BufReader::new(file); - GitOid::<$hash_ty, $object_ty>::from_reader(reader)? - } else { - GitOid::<$hash_ty, $object_ty>::from_reader(file)? - }; + output.unwrap_or_else(null) + } + /// Create a new `GitOid` by reading data from a file. + /// + /// # Safety + /// + /// The provided file descriptor must be valid and open for reading. + /// + /// Returns an invalid `GitOid` if construction fails. + #[cfg(target_family = "unix")] + #[no_mangle] + pub unsafe extern "C" fn []( + fd: RawFd, + expected_length: c_int, + ) -> *const [] { + let output = catch_panic(|| { + let file = unsafe { File::from_raw_fd(fd) }; + let expected_length = expected_length as usize; + let gitoid = GitOid::<$hash_ty, $object_ty>::from_reader_with_expected_length(file, expected_length)?; let boxed = Box::new(gitoid); Ok(Box::into_raw(boxed) as *const _) }); @@ -220,7 +235,7 @@ macro_rules! generate_gitoid_ffi_for_hash { #[cfg(target_family = "windows")] #[no_mangle] /// cbindgen:ignore - pub unsafe extern "C" fn []( + pub unsafe extern "C" fn []( handle: RawHandle, ) -> *const [] { let output = catch_panic(|| { @@ -234,6 +249,32 @@ macro_rules! generate_gitoid_ffi_for_hash { output.unwrap_or_else(null) } + /// Create a new `GitOid` by reading data from a file. + /// + /// # Safety + /// + /// The provided file handle must be valid and open for reading. + /// + /// Returns an invalid `GitOid` if construction fails. + #[cfg(target_family = "windows")] + #[no_mangle] + /// cbindgen:ignore + pub unsafe extern "C" fn []( + handle: RawHandle, + expected_length: c_int, + ) -> *const [] { + let output = catch_panic(|| { + let file = unsafe { File::from_raw_handle(handle) }; + let expected_length = expected_length as usize; + let reader = BufReader::new(file); + let gitoid = GitOid::<$hash_ty, $object_ty>::new_from_reader_with_expected_length(reader, expected_length)?; + let boxed = Box::new(gitoid); + Ok(Box::into_raw(boxed) as *const _) + }); + + output.unwrap_or_else(null) + } + /// Construct a URL representation of a `GitOid`. /// /// # Safety diff --git a/gitoid/src/gitoid.rs b/gitoid/src/gitoid.rs index fa9b54c..0868609 100644 --- a/gitoid/src/gitoid.rs +++ b/gitoid/src/gitoid.rs @@ -9,6 +9,9 @@ use core::hash::Hash; use core::marker::PhantomData; use core::ops::Not as _; use digest::OutputSizeUser; +use format_bytes::format_bytes; +use format_bytes::write_bytes; +use format_bytes::DisplayBytes; use generic_array::sequence::GenericSequence; use generic_array::ArrayLength; use generic_array::GenericArray; @@ -16,13 +19,16 @@ use std::cmp::Ordering; use std::fmt::Debug; use std::fmt::Display; use std::fmt::Result as FmtResult; +use std::hash::Hasher; +use std::io::BufRead; +use std::io::BufReader; use std::io::Read; +use std::io::Result as IoResult; use std::io::Seek; use std::io::SeekFrom; +use std::io::Write; use std::str::FromStr; use std::str::Split; -use std::hash::Hasher; -use std::io::BufReader; use url::Url; /// A struct that computes [gitoids][g] based on the selected algorithm @@ -43,6 +49,8 @@ where value: GenericArray, } +const GITOID_URL_SCHEME: &str = "gitoid"; + impl GitOid where H: HashAlgorithm, @@ -55,6 +63,8 @@ where //------------------------------------------------------------------------------------------- /// Helper constructor for building a [`GitOid`] from a parsed hash. + /// + /// Use this so you don't have to care about filling in the phantom data. fn from_hash(value: GenericArray) -> GitOid { GitOid { _phantom: PhantomData, @@ -71,12 +81,8 @@ where ::OutputSize: ArrayLength, GenericArray: Copy, { - let digester = H::new(); - let reader = BufReader::new(content); - let expected_length = content.len(); - // PANIC SAFETY: We're reading from an in-memory buffer, so no IO errors can arise. - gitoid_from_buffer(digester, reader, expected_length).unwrap() + gitoid_from_buffer(H::new(), content, content.len()).unwrap() } inner(content.as_ref()) @@ -99,17 +105,22 @@ where } /// Create a `GitOid` from a reader. - pub fn from_reader( - mut reader: R - ) -> Result> { - let digester = H::new(); + pub fn from_reader(mut reader: R) -> Result> { let expected_length = stream_len(&mut reader)? as usize; - gitoid_from_buffer(digester, reader, expected_length) + GitOid::from_reader_with_expected_length(reader, expected_length) + } + + /// Generate a `GitOid` from a reader, providing an expected length in bytes. + pub fn from_reader_with_expected_length( + reader: R, + expected_length: usize, + ) -> Result> { + gitoid_from_buffer(H::new(), reader, expected_length) } /// Construct a new `GitOid` from a `Url`. pub fn from_url(url: Url) -> Result> { - url.try_into() + GitOid::try_from(url) } //=========================================================================================== @@ -118,9 +129,15 @@ where /// Get a URL for the current `GitOid`. pub fn url(&self) -> Url { - let s = format!("gitoid:{}:{}:{}", O::NAME, H::NAME, self.as_hex()); // PANIC SAFETY: We know that this is a valid URL. - Url::parse(&s).unwrap() + Url::parse(&format!( + "{}:{}:{}:{}", + GITOID_URL_SCHEME, + O::NAME, + H::NAME, + self.as_hex() + )) + .unwrap() } /// Get the underlying bytes of the hash. @@ -317,7 +334,7 @@ where } fn validate_url_scheme(&self) -> Result<()> { - if self.url.scheme() != "gitoid" { + if self.url.scheme() != GITOID_URL_SCHEME { return Err(Error::InvalidScheme(self.url.clone())); } @@ -406,9 +423,9 @@ where /// `expected_length`. If the actual bytes hashed differs, then something went /// wrong and the hash is not valid. fn gitoid_from_buffer( - mut digester: H, - mut reader: R, - expected_length: usize, + digester: H, + reader: R, + expected_read_length: usize, ) -> Result> where H: HashAlgorithm, @@ -417,38 +434,19 @@ where GenericArray: Copy, R: Read, { - let prefix = format!("{} {}\0", O::NAME, expected_length); - - // Linux default page size is 4096, so use that. - let mut buf = [0; 4096]; - let mut amount_read: usize = 0; - - digester.update(prefix.as_bytes()); + let expected_hash_length = ::output_size(); + let (hash, amount_read) = hash_from_buffer::(digester, reader, expected_read_length)?; - loop { - match reader.read(&mut buf)? { - 0 => break, - - size => { - digester.update(&buf[..size]); - amount_read += size; - } - } - } - - if amount_read != expected_length { + if amount_read != expected_read_length { return Err(Error::BadLength { - expected: expected_length, + expected: expected_read_length, actual: amount_read, }); } - let hash = digester.finalize(); - let expected_size = ::output_size(); - - if hash.len() != expected_size { + if hash.len() != expected_hash_length { return Err(Error::UnexpectedHashLength { - expected: expected_size, + expected: expected_hash_length, observed: hash.len(), }); } @@ -456,6 +454,72 @@ where Ok(GitOid::from_hash(hash)) } +// Helper struct for using `format_bytes` during prefix construction for hashing. +struct Str<'s>(&'s str); + +impl<'s> DisplayBytes for Str<'s> { + fn display_bytes(&self, output: &mut dyn Write) -> IoResult<()> { + write_bytes!(output, b"{}", self.0.as_bytes())?; + Ok(()) + } +} + +// Helper extension trait to give a convenient way to iterate over +// chunks sized to the size of the internal buffer of the reader. +trait ForEachChunk: BufRead { + // Takes a function to apply to each chunk, and returns if any + // errors arose along with the number of bytes read in total. + fn for_each_chunk(&mut self, f: impl FnMut(&[u8])) -> Result; +} + +impl ForEachChunk for R { + fn for_each_chunk(&mut self, mut f: impl FnMut(&[u8])) -> Result { + let mut total_read = 0; + + loop { + let buffer = self.fill_buf()?; + let amount_read = buffer.len(); + + if amount_read == 0 { + break; + } + + f(buffer); + + self.consume(amount_read); + total_read += amount_read; + } + + Ok(total_read) + } +} + +/// Helper function which actually applies the [`GitOid`] construction rules. +/// +/// This function handles actually constructing the hash with the GitOID prefix, +/// and delegates to a buffered reader for performance of the chunked reading. +fn hash_from_buffer( + mut digester: H, + reader: R, + expected_read_length: usize, +) -> Result<(GenericArray, usize)> +where + H: HashAlgorithm, + O: ObjectType, + ::OutputSize: ArrayLength, + GenericArray: Copy, + R: Read, +{ + digester.update(format_bytes!( + b"{} {}\0", + Str(O::NAME), + expected_read_length + )); + let amount_read = BufReader::new(reader).for_each_chunk(|b| digester.update(b))?; + let hash = digester.finalize(); + Ok((hash, amount_read)) +} + // Adapted from the Rust standard library's unstable implementation // of `Seek::stream_len`. // diff --git a/gitoid/src/hash_algorithm.rs b/gitoid/src/hash_algorithm.rs index c2e5e31..ea9baab 100644 --- a/gitoid/src/hash_algorithm.rs +++ b/gitoid/src/hash_algorithm.rs @@ -8,10 +8,6 @@ use sha1::Sha1; use sha1collisiondetection::Sha1CD as Sha1Cd; use sha2::Sha256; -mod private { - pub trait Sealed {} -} - /// Hash algorithms that can be used to make a [`GitOid`]. /// /// This is a sealed trait to ensure it's only used for hash diff --git a/gitoid/src/lib.rs b/gitoid/src/lib.rs index 06823f0..1a87025 100644 --- a/gitoid/src/lib.rs +++ b/gitoid/src/lib.rs @@ -1,7 +1,7 @@ //! A content-addressable identity for software artifacts. //! //! ## What are GitOIDs? -//! +//! //! Git Object Identifiers ([GitOIDs][gitoid]) are a mechanism for //! identifying artifacts in a manner which is independently reproducible //! because it relies only on the contents of the artifact itself. @@ -9,13 +9,13 @@ //! The GitOID scheme comes from the Git version control system, which uses //! this mechanism to identify commits, tags, files (called "blobs"), and //! directories (called "trees"). -//! +//! //! This implementation of GitOIDs is produced by the [OmniBOR][omnibor] //! working group, which uses GitOIDs as the basis for OmniBOR Artifact //! Identifiers. //! //! ### GitOID URL Scheme -//! +//! //! `gitoid` is also an IANA-registered URL scheme, meaning that GitOIDs //! are represented and shared as URLs. A `gitoid` URL looks like: //! @@ -29,14 +29,14 @@ //! separated by a colon. //! //! ### GitOID Hash Construction -//! +//! //! GitOID hashes are made by hashing a prefix string containing the object //! type and the size of the object being hashed in bytes, followed by a null //! terminator, and then hashing the object itself. So GitOID hashes do not //! match the result of only hashing the object. -//! +//! //! ### GitOID Object Types -//! +//! //! The valid object types for a GitOID are: //! //! - `blob` @@ -47,9 +47,9 @@ //! Currently, this crate implements convenient handling of `blob` objects, //! but does not handle ensuring the proper formatting of `tree`, `commit`, //! or `tag` objects to match the Git implementation. -//! +//! //! ### GitOID Hash Algorithms -//! +//! //! The valid hash algorithms are: //! //! - `sha1` @@ -61,37 +61,37 @@ //! believes to be an attempt to generate a purposeful SHA-1 collision, //! in which case it modifies the hash process to produce a different output //! and avoid the malicious collision. -//! +//! //! Git does this under the hood, but does not clearly distinguish to end //! users that the underlying hashing algorithm isn't equivalent to SHA-1. //! This is fine for Git, where the specific hash used is an implementation //! detail and only matters within a single repository, but for the OmniBOR //! working group it's important to distinguish whether plain SHA-1 or //! SHA-1DC is being used, so it's distinguished in the code for this crate. -//! +//! //! This means for compatibility with Git that SHA-1DC should be used. -//! +//! //! ## Why Care About GitOIDs? -//! +//! //! GitOIDs provide a convenient mechanism to establish artifact identity and //! validate artifact integrity (this artifact hasn't been modified) and //! agreement (I have the same artifact you have). The fact that they're based //! only on the type of object ("`blob`", usually) and the artifact itself //! means they can be derived independently, enabling distributed artifact //! identification that avoids a central decider. -//! +//! //! Alternative identity schemes, like Package URLs (purls) or Common Platform //! Enumerations (CPEs) rely on central authorities to produce identifiers or //! define the taxonomy in which identifiers are produced. -//! +//! //! ## Using this Crate -//! +//! //! The central type of this crate is [`GitOid`], which is generic over both //! the hash algorithm used and the object type being identified. The //! [`HashAlgorithm`] trait, which is sealed, is implemented by the types //! found in `gitoid::hash`. The [`ObjectType`] trait, which is also sealed, //! is implemented by the types found in `gitoid::object`. -//! +//! //! [gitoid]: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects //! [omnibor]: https://omnibor.io diff --git a/gitoid/test/c/test.c b/gitoid/test/c/test.c index 8cd29f1..3310fbf 100644 --- a/gitoid/test/c/test.c +++ b/gitoid/test/c/test.c @@ -7,22 +7,22 @@ #define LEN(arr) (sizeof(arr) / sizeof(arr[0])); #define TEST(NAME) {.name = #NAME, .fn = NAME} -void test_gitoid_new_from_str() { - const GitOidSha1Blob* gitoid = gitoid_sha1_blob_new_from_str("hello world"); +void test_gitoid_from_str() { + const GitOidSha1Blob* gitoid = gitoid_sha1_blob_from_str("hello world"); assert(gitoid != NULL); assert(gitoid_sha1_blob_hash_len() == 20); assert(gitoid_sha1_blob_get_hash_bytes(gitoid)[0] == 149); gitoid_sha1_blob_free(gitoid); } -void test_gitoid_new_from_bytes() { +void test_gitoid_from_bytes() { unsigned char bytes[] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; uint8_t bytes_len = LEN(bytes); - const GitOidSha1Blob* gitoid = gitoid_sha1_blob_new_from_bytes( + const GitOidSha1Blob* gitoid = gitoid_sha1_blob_from_bytes( bytes, bytes_len ); @@ -33,9 +33,9 @@ void test_gitoid_new_from_bytes() { gitoid_sha1_blob_free(gitoid); } -void test_gitoid_new_from_url() { +void test_gitoid_from_url() { char *url = "gitoid:blob:sha256:fee53a18d32820613c0527aa79be5cb30173c823a9b448fa4817767cc84c6f03"; - const GitOidSha256Blob* gitoid = gitoid_sha256_blob_new_from_url(url); + const GitOidSha256Blob* gitoid = gitoid_sha256_blob_from_url(url); assert(gitoid != NULL); assert(gitoid_sha256_blob_hash_len() == 32); assert(gitoid_sha256_blob_get_hash_bytes(gitoid)[0] == 254); @@ -44,7 +44,7 @@ void test_gitoid_new_from_url() { void test_gitoid_get_url() { char *url_in = "gitoid:blob:sha256:fee53a18d32820613c0527aa79be5cb30173c823a9b448fa4817767cc84c6f03"; - const GitOidSha256Blob* gitoid = gitoid_sha256_blob_new_from_url(url_in); + const GitOidSha256Blob* gitoid = gitoid_sha256_blob_from_url(url_in); assert(gitoid != NULL); const char *url_out = gitoid_sha256_blob_get_url(gitoid); assert(strncmp(url_in, url_out, 83) == 0); @@ -53,7 +53,7 @@ void test_gitoid_get_url() { } void test_gitoid_hash_algorithm_name() { - const GitOidSha1Blob* gitoid = gitoid_sha1_blob_new_from_str("hello world"); + const GitOidSha1Blob* gitoid = gitoid_sha1_blob_from_str("hello world"); assert(gitoid != NULL); const char *hash_algorithm = gitoid_sha1_blob_hash_algorithm_name(gitoid); assert(strncmp(hash_algorithm, "sha1", 4) == 0); @@ -61,7 +61,7 @@ void test_gitoid_hash_algorithm_name() { } void test_gitoid_object_type_name() { - const GitOidSha1Blob* gitoid = gitoid_sha1_blob_new_from_str("hello world"); + const GitOidSha1Blob* gitoid = gitoid_sha1_blob_from_str("hello world"); assert(gitoid != NULL); const char *object_type = gitoid_sha1_blob_object_type_name(gitoid); assert(strncmp(object_type, "blob", 4) == 0); @@ -70,7 +70,7 @@ void test_gitoid_object_type_name() { void test_gitoid_validity() { char *validity_url = "gitoid:blob:sha000:fee53a18d32820613c0527aa79be5cb30173c823a9b448fa4817767cc84c6f03"; - const GitOidSha1Blob* gitoid = gitoid_sha1_blob_new_from_url(validity_url); + const GitOidSha1Blob* gitoid = gitoid_sha1_blob_from_url(validity_url); assert(gitoid == NULL); char *expected_msg = "string is not a valid GitOID URL"; @@ -90,9 +90,9 @@ int main() { setvbuf(stdout, NULL, _IONBF, 0); test_t tests[] = { - TEST(test_gitoid_new_from_str), - TEST(test_gitoid_new_from_bytes), - TEST(test_gitoid_new_from_url), + TEST(test_gitoid_from_str), + TEST(test_gitoid_from_bytes), + TEST(test_gitoid_from_url), TEST(test_gitoid_get_url), TEST(test_gitoid_hash_algorithm_name), TEST(test_gitoid_object_type_name),