diff --git a/src/bitmap/container.rs b/src/bitmap/container.rs
index a28c7c67..3ab1e5ad 100644
--- a/src/bitmap/container.rs
+++ b/src/bitmap/container.rs
@@ -37,6 +37,10 @@ impl Container {
         self.store.len()
     }
 
+    pub fn is_empty(&self) -> bool {
+        self.store.is_empty()
+    }
+
     pub fn insert(&mut self, index: u16) -> bool {
         if self.store.insert(index) {
             self.ensure_correct_store();
diff --git a/src/bitmap/inherent.rs b/src/bitmap/inherent.rs
index 57c3ffec..559b7579 100644
--- a/src/bitmap/inherent.rs
+++ b/src/bitmap/inherent.rs
@@ -208,7 +208,7 @@ impl RoaringBitmap {
         match self.containers.binary_search_by_key(&key, |c| c.key) {
             Ok(loc) => {
                 if self.containers[loc].remove(index) {
-                    if self.containers[loc].len() == 0 {
+                    if self.containers[loc].is_empty() {
                         self.containers.remove(loc);
                     }
                     true
@@ -253,7 +253,7 @@ impl RoaringBitmap {
                 let a = if key == start_container_key { start_index } else { 0 };
                 let b = if key == end_container_key { end_index } else { u16::MAX };
                 removed += self.containers[index].remove_range(a..=b);
-                if self.containers[index].len() == 0 {
+                if self.containers[index].is_empty() {
                     self.containers.remove(index);
                     continue;
                 }
diff --git a/src/bitmap/mod.rs b/src/bitmap/mod.rs
index 4aebfc37..ed5567d5 100644
--- a/src/bitmap/mod.rs
+++ b/src/bitmap/mod.rs
@@ -12,10 +12,12 @@ mod cmp;
 mod inherent;
 mod iter;
 mod ops;
+#[cfg(feature = "std")]
+mod ops_with_serialized;
 #[cfg(feature = "serde")]
 mod serde;
 #[cfg(feature = "std")]
-mod serialization;
+pub(crate) mod serialization;
 
 use self::cmp::Pairs;
 pub use self::iter::IntoIter;
diff --git a/src/bitmap/multiops.rs b/src/bitmap/multiops.rs
index 1982e4f3..66a4e085 100644
--- a/src/bitmap/multiops.rs
+++ b/src/bitmap/multiops.rs
@@ -232,7 +232,7 @@ fn try_multi_or_owned(
     }
 
     containers.retain_mut(|container| {
-        if container.len() > 0 {
+        if !container.is_empty() {
             container.ensure_correct_store();
             true
         } else {
@@ -258,7 +258,7 @@ fn try_multi_xor_owned(
     }
 
     containers.retain_mut(|container| {
-        if container.len() > 0 {
+        if !container.is_empty() {
             container.ensure_correct_store();
             true
         } else {
diff --git a/src/bitmap/ops.rs b/src/bitmap/ops.rs
index f99a376b..4337dbdb 100644
--- a/src/bitmap/ops.rs
+++ b/src/bitmap/ops.rs
@@ -223,7 +223,7 @@ impl BitAnd<&RoaringBitmap> for &RoaringBitmap {
         for pair in Pairs::new(&self.containers, &rhs.containers) {
             if let (Some(lhs), Some(rhs)) = pair {
                 let container = BitAnd::bitand(lhs, rhs);
-                if container.len() != 0 {
+                if !container.is_empty() {
                     containers.push(container);
                 }
             }
@@ -248,7 +248,7 @@ impl BitAndAssign<RoaringBitmap> for RoaringBitmap {
                     let rhs_cont = &mut rhs.containers[loc];
                     let rhs_cont = mem::replace(rhs_cont, Container::new(rhs_cont.key));
                     BitAndAssign::bitand_assign(cont, rhs_cont);
-                    cont.len() != 0
+                    !cont.is_empty()
                 }
                 Err(_) => false,
             }
@@ -264,7 +264,7 @@ impl BitAndAssign<&RoaringBitmap> for RoaringBitmap {
             match rhs.containers.binary_search_by_key(&key, |c| c.key) {
                 Ok(loc) => {
                     BitAndAssign::bitand_assign(cont, &rhs.containers[loc]);
-                    cont.len() != 0
+                    !cont.is_empty()
                 }
                 Err(_) => false,
             }
@@ -314,7 +314,7 @@ impl Sub<&RoaringBitmap> for &RoaringBitmap {
                 (None, Some(_)) => (),
                 (Some(lhs), Some(rhs)) => {
                     let container = Sub::sub(lhs, rhs);
-                    if container.len() != 0 {
+                    if !container.is_empty() {
                         containers.push(container);
                     }
                 }
@@ -340,7 +340,7 @@ impl SubAssign<&RoaringBitmap> for RoaringBitmap {
             match rhs.containers.binary_search_by_key(&cont.key, |c| c.key) {
                 Ok(loc) => {
                     SubAssign::sub_assign(cont, &rhs.containers[loc]);
-                    cont.len() != 0
+                    !cont.is_empty()
                 }
                 Err(_) => true,
             }
@@ -390,7 +390,7 @@ impl BitXor<&RoaringBitmap> for &RoaringBitmap {
                 (None, Some(rhs)) => containers.push(rhs.clone()),
                 (Some(lhs), Some(rhs)) => {
                     let container = BitXor::bitxor(lhs, rhs);
-                    if container.len() != 0 {
+                    if !container.is_empty() {
                         containers.push(container);
                     }
                 }
@@ -409,7 +409,7 @@ impl BitXorAssign<RoaringBitmap> for RoaringBitmap {
             match pair {
                 (Some(mut lhs), Some(rhs)) => {
                     BitXorAssign::bitxor_assign(&mut lhs, rhs);
-                    if lhs.len() != 0 {
+                    if !lhs.is_empty() {
                         self.containers.push(lhs);
                     }
                 }
@@ -428,7 +428,7 @@ impl BitXorAssign<&RoaringBitmap> for RoaringBitmap {
             match pair {
                 (Some(mut lhs), Some(rhs)) => {
                     BitXorAssign::bitxor_assign(&mut lhs, rhs);
-                    if lhs.len() != 0 {
+                    if !lhs.is_empty() {
                         self.containers.push(lhs);
                     }
                 }
diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs
new file mode 100644
index 00000000..e3808803
--- /dev/null
+++ b/src/bitmap/ops_with_serialized.rs
@@ -0,0 +1,323 @@
+use bytemuck::cast_slice_mut;
+use byteorder::{LittleEndian, ReadBytesExt};
+use core::convert::Infallible;
+use std::error::Error;
+use std::io::{self, SeekFrom};
+use std::mem;
+use std::ops::RangeInclusive;
+
+use crate::bitmap::container::Container;
+use crate::bitmap::serialization::{
+    NO_OFFSET_THRESHOLD, SERIAL_COOKIE, SERIAL_COOKIE_NO_RUNCONTAINER,
+};
+use crate::RoaringBitmap;
+
+use super::container::ARRAY_LIMIT;
+use super::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH};
+
+impl RoaringBitmap {
+    /// Computes the intersection between a materialized [`RoaringBitmap`] and a serialized one.
+    ///
+    /// This is faster and more space efficient when you only need the intersection result.
+    /// It reduces the number of deserialized internal containers and therefore
+    /// the number of allocations and copies of bytes.
+    ///
+    /// The array and bitmap payloads are handed to the `*_unchecked` store constructors,
+    /// so `other` is expected to hold a well-formed serialized bitmap.
+    ///
+    /// # Errors
+    ///
+    /// Returns an [`io::Error`] if the cookie is unknown, if the announced number of
+    /// containers is greater than supported, if a run ends past `u16::MAX`, or if reading
+    /// from or seeking in `other` fails.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use roaring::RoaringBitmap;
+    /// use std::io::Cursor;
+    ///
+    /// let rb1: RoaringBitmap = (1..4).collect();
+    /// let rb2: RoaringBitmap = (3..5).collect();
+    ///
+    /// // Let's say the rb2 bitmap is serialized
+    /// let mut bytes = Vec::new();
+    /// rb2.serialize_into(&mut bytes).unwrap();
+    /// let rb2_bytes = Cursor::new(bytes);
+    ///
+    /// assert_eq!(
+    ///     rb1.intersection_with_serialized_unchecked(rb2_bytes).unwrap(),
+    ///     rb1 & rb2,
+    /// );
+    /// ```
+    pub fn intersection_with_serialized_unchecked<R>(&self, other: R) -> io::Result<RoaringBitmap>
+    where
+        R: io::Read + io::Seek,
+    {
+        RoaringBitmap::intersection_with_serialized_impl::<R, _, Infallible, _, Infallible>(
+            self,
+            other,
+            |values| Ok(ArrayStore::from_vec_unchecked(values)),
+            |len, values| Ok(BitmapStore::from_unchecked(len, values)),
+        )
+    }
+
+    /// Reads the header of the serialized bitmap in `reader` and intersects it with `self`.
+    ///
+    /// `a` and `b` build the array and bitmap stores from the decoded values; their errors
+    /// are reported as [`io::ErrorKind::InvalidData`]. When the header carries an offset
+    /// table the work is delegated to `intersection_with_serialized_impl_with_offsets`,
+    /// otherwise containers are read sequentially and the ones whose key is absent from
+    /// `self` are skipped with a relative seek.
+    fn intersection_with_serialized_impl<R, A, AErr, B, BErr>(
+        &self,
+        mut reader: R,
+        a: A,
+        b: B,
+    ) -> io::Result<RoaringBitmap>
+    where
+        R: io::Read + io::Seek,
+        A: Fn(Vec<u16>) -> Result<ArrayStore, AErr>,
+        AErr: Error + Send + Sync + 'static,
+        B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
+        BErr: Error + Send + Sync + 'static,
+    {
+        // First read the cookie to determine which version of the format we are reading
+        let (size, has_offsets, has_run_containers) = {
+            let cookie = reader.read_u32::<LittleEndian>()?;
+            if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
+                (reader.read_u32::<LittleEndian>()? as usize, true, false)
+            } else if (cookie as u16) == SERIAL_COOKIE {
+                let size = ((cookie >> 16) + 1) as usize;
+                (size, size >= NO_OFFSET_THRESHOLD, true)
+            } else {
+                return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
+            }
+        };
+
+        // Read the run container bitmap if necessary
+        let run_container_bitmap = if has_run_containers {
+            let mut bitmap = vec![0u8; (size + 7) / 8];
+            reader.read_exact(&mut bitmap)?;
+            Some(bitmap)
+        } else {
+            None
+        };
+
+        if size > u16::MAX as usize + 1 {
+            return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
+        }
+
+        // Read the container descriptions
+        let mut descriptions = vec![[0; 2]; size];
+        reader.read_exact(cast_slice_mut(&mut descriptions))?;
+        descriptions.iter_mut().for_each(|[ref mut key, ref mut len]| {
+            *key = u16::from_le(*key);
+            *len = u16::from_le(*len);
+        });
+
+        if has_offsets {
+            let mut offsets = vec![0; size];
+            reader.read_exact(cast_slice_mut(&mut offsets))?;
+            offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset));
+            return self.intersection_with_serialized_impl_with_offsets(
+                reader,
+                a,
+                b,
+                &descriptions,
+                &offsets,
+                run_container_bitmap.as_deref(),
+            );
+        }
+
+        // Read each container and skip the useless ones
+        let mut containers = Vec::new();
+        for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() {
+            let container = match self.containers.binary_search_by_key(&key, |c| c.key) {
+                Ok(index) => self.containers.get(index),
+                Err(_) => None,
+            };
+            let cardinality = u64::from(len_minus_one) + 1;
+
+            // If the run container bitmap is present, check if this container is a run container
+            let is_run_container =
+                run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0);
+
+            let store = if is_run_container {
+                let runs = reader.read_u16::<LittleEndian>()?;
+                match container {
+                    Some(_) => {
+                        let mut intervals = vec![[0, 0]; runs as usize];
+                        reader.read_exact(cast_slice_mut(&mut intervals))?;
+                        intervals.iter_mut().for_each(|[s, len]| {
+                            *s = u16::from_le(*s);
+                            *len = u16::from_le(*len);
+                        });
+
+                        let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum();
+                        let mut store = Store::with_capacity(cardinality);
+                        intervals.into_iter().try_for_each(
+                            |[s, len]| -> Result<(), io::ErrorKind> {
+                                let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?;
+                                store.insert_range(RangeInclusive::new(s, end));
+                                Ok(())
+                            },
+                        )?;
+                        store
+                    }
+                    None => {
+                        let runs_size = mem::size_of::<u16>() * 2 * runs as usize;
+                        reader.seek(SeekFrom::Current(runs_size as i64))?;
+                        continue;
+                    }
+                }
+            } else if cardinality <= ARRAY_LIMIT {
+                match container {
+                    Some(_) => {
+                        let mut values = vec![0; cardinality as usize];
+                        reader.read_exact(cast_slice_mut(&mut values))?;
+                        values.iter_mut().for_each(|n| *n = u16::from_le(*n));
+                        let array =
+                            a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+                        Store::Array(array)
+                    }
+                    None => {
+                        let array_size = mem::size_of::<u16>() * cardinality as usize;
+                        reader.seek(SeekFrom::Current(array_size as i64))?;
+                        continue;
+                    }
+                }
+            } else {
+                match container {
+                    Some(_) => {
+                        let mut values = Box::new([0; BITMAP_LENGTH]);
+                        reader.read_exact(cast_slice_mut(&mut values[..]))?;
+                        values.iter_mut().for_each(|n| *n = u64::from_le(*n));
+                        let bitmap = b(cardinality, values)
+                            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+                        Store::Bitmap(bitmap)
+                    }
+                    None => {
+                        let bitmap_size = mem::size_of::<u64>() * BITMAP_LENGTH;
+                        reader.seek(SeekFrom::Current(bitmap_size as i64))?;
+                        continue;
+                    }
+                }
+            };
+
+            if let Some(container) = container {
+                let mut other_container = Container { key, store };
+                other_container &= container;
+                if !other_container.is_empty() {
+                    containers.push(other_container);
+                }
+            }
+        }
+
+        Ok(RoaringBitmap { containers })
+    }
+
+    /// Intersects `self` with the serialized containers located through `offsets`.
+    ///
+    /// Only the containers whose key is present in `self` are read. Read failures are
+    /// returned as errors, like in the sequential path.
+    ///
+    /// NOTE(review): every offset is applied with [`SeekFrom::Start`], which presumably
+    /// requires the serialized bitmap to begin at position 0 of `reader` -- confirm.
+    fn intersection_with_serialized_impl_with_offsets<R, A, AErr, B, BErr>(
+        &self,
+        mut reader: R,
+        a: A,
+        b: B,
+        descriptions: &[[u16; 2]],
+        offsets: &[u32],
+        run_container_bitmap: Option<&[u8]>,
+    ) -> io::Result<RoaringBitmap>
+    where
+        R: io::Read + io::Seek,
+        A: Fn(Vec<u16>) -> Result<ArrayStore, AErr>,
+        AErr: Error + Send + Sync + 'static,
+        B: Fn(u64, Box<[u64; 1024]>) -> Result<BitmapStore, BErr>,
+        BErr: Error + Send + Sync + 'static,
+    {
+        let mut containers = Vec::new();
+        for container in &self.containers {
+            let i = match descriptions.binary_search_by_key(&container.key, |[k, _]| *k) {
+                Ok(index) => index,
+                Err(_) => continue,
+            };
+
+            // Seek to the bytes of the container we want.
+            reader.seek(SeekFrom::Start(offsets[i] as u64))?;
+
+            let [key, len_minus_one] = descriptions[i];
+            let cardinality = u64::from(len_minus_one) + 1;
+
+            // If the run container bitmap is present, check if this container is a run container
+            let is_run_container =
+                run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0);
+
+            let store = if is_run_container {
+                let runs = reader.read_u16::<LittleEndian>()?;
+                let mut intervals = vec![[0, 0]; runs as usize];
+                reader.read_exact(cast_slice_mut(&mut intervals))?;
+                intervals.iter_mut().for_each(|[s, len]| {
+                    *s = u16::from_le(*s);
+                    *len = u16::from_le(*len);
+                });
+
+                let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum();
+                let mut store = Store::with_capacity(cardinality);
+                intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> {
+                    let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?;
+                    store.insert_range(RangeInclusive::new(s, end));
+                    Ok(())
+                })?;
+                store
+            } else if cardinality <= ARRAY_LIMIT {
+                let mut values = vec![0; cardinality as usize];
+                reader.read_exact(cast_slice_mut(&mut values))?;
+                values.iter_mut().for_each(|n| *n = u16::from_le(*n));
+                let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+                Store::Array(array)
+            } else {
+                let mut values = Box::new([0; BITMAP_LENGTH]);
+                reader.read_exact(cast_slice_mut(&mut values[..]))?;
+                values.iter_mut().for_each(|n| *n = u64::from_le(*n));
+                let bitmap = b(cardinality, values)
+                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+                Store::Bitmap(bitmap)
+            };
+
+            let mut other_container = Container { key, store };
+            other_container &= container;
+            if !other_container.is_empty() {
+                containers.push(other_container);
+            }
+        }
+
+        Ok(RoaringBitmap { containers })
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::RoaringBitmap;
+    use proptest::prelude::*;
+    use std::io::Cursor;
+
+    // The intersection with a serialized bitmap must equal the materialized intersection
+    proptest! {
+        #[test]
+        fn intersection_with_serialized_eq_materialized_intersection(
+            a in RoaringBitmap::arbitrary(),
+            b in RoaringBitmap::arbitrary()
+        ) {
+            let mut serialized_bytes_b = Vec::new();
+            b.serialize_into(&mut serialized_bytes_b).unwrap();
+            let serialized_bytes_b = &serialized_bytes_b[..];
+
+            prop_assert_eq!(a.intersection_with_serialized_unchecked(Cursor::new(serialized_bytes_b)).unwrap(), a & b);
+        }
+    }
+}
diff --git a/src/bitmap/serialization.rs b/src/bitmap/serialization.rs
index 790e2d28..f04f5039 100644
--- a/src/bitmap/serialization.rs
+++ b/src/bitmap/serialization.rs
@@ -9,13 +9,13 @@ use crate::bitmap::container::{Container, ARRAY_LIMIT};
 use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH};
 use crate::RoaringBitmap;
 
-const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
-const SERIAL_COOKIE: u16 = 12347;
-const NO_OFFSET_THRESHOLD: usize = 4;
+pub const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
+pub const SERIAL_COOKIE: u16 = 12347;
+pub const NO_OFFSET_THRESHOLD: usize = 4;
 
 // Sizes of header structures
-const DESCRIPTION_BYTES: usize = 4;
-const OFFSET_BYTES: usize = 4;
+pub const DESCRIPTION_BYTES: usize = 4;
+pub const OFFSET_BYTES: usize = 4;
 
 impl RoaringBitmap {
     /// Return the size in bytes of the serialized output.
diff --git a/src/bitmap/store/array_store/mod.rs b/src/bitmap/store/array_store/mod.rs
index 6c41aadb..883db31f 100644
--- a/src/bitmap/store/array_store/mod.rs
+++ b/src/bitmap/store/array_store/mod.rs
@@ -200,6 +200,10 @@ impl ArrayStore {
         self.vec.len() as u64
     }
 
+    pub fn is_empty(&self) -> bool {
+        self.vec.is_empty()
+    }
+
     pub fn min(&self) -> Option<u16> {
         self.vec.first().copied()
     }
diff --git a/src/bitmap/store/bitmap_store.rs b/src/bitmap/store/bitmap_store.rs
index 731fc929..f349a2aa 100644
--- a/src/bitmap/store/bitmap_store.rs
+++ b/src/bitmap/store/bitmap_store.rs
@@ -241,6 +241,10 @@ impl BitmapStore {
         self.len
     }
 
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
     pub fn min(&self) -> Option<u16> {
         self.bits
             .iter()
diff --git a/src/bitmap/store/mod.rs b/src/bitmap/store/mod.rs
index 25426295..bb0d5822 100644
--- a/src/bitmap/store/mod.rs
+++ b/src/bitmap/store/mod.rs
@@ -177,6 +177,13 @@ impl Store {
         }
     }
 
+    pub fn is_empty(&self) -> bool {
+        match self {
+            Array(vec) => vec.is_empty(),
+            Bitmap(bits) => bits.is_empty(),
+        }
+    }
+
     pub fn min(&self) -> Option<u16> {
         match self {
             Array(vec) => vec.min(),