From 9412a7b8be29480240130bb6735d903bdbf356c0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 9 Oct 2023 15:49:21 +0200 Subject: [PATCH] union with serialized bitmap --- src/bitmap/mod.rs | 1 + src/bitmap/ops_with_serialized.rs | 173 ++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 src/bitmap/ops_with_serialized.rs diff --git a/src/bitmap/mod.rs b/src/bitmap/mod.rs index 9b34bdcd..c2c2ea8c 100644 --- a/src/bitmap/mod.rs +++ b/src/bitmap/mod.rs @@ -12,6 +12,7 @@ mod cmp; mod inherent; mod iter; mod ops; +mod ops_with_serialized; #[cfg(feature = "serde")] mod serde; mod serialization; diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs new file mode 100644 index 00000000..5be221b6 --- /dev/null +++ b/src/bitmap/ops_with_serialized.rs @@ -0,0 +1,173 @@ +use bytemuck::cast_slice_mut; +use byteorder::{LittleEndian, ReadBytesExt}; +use std::{ + io::{self, Read}, + mem::size_of, +}; + +use crate::RoaringBitmap; + +use super::{ + container::Container, + store::{ArrayStore, BitmapStore, Store}, +}; + +const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; +const SERIAL_COOKIE: u16 = 12347; + +impl RoaringBitmap { + pub fn union_with_serialized(&mut self, mut reader: impl Read) -> io::Result<()> { + let (size, has_offsets) = { + let cookie = reader.read_u32::()?; + if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { + (reader.read_u32::()? as usize, true) + } else if (cookie as u16) == SERIAL_COOKIE { + return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported")); + } else { + return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); + } + }; + + if size > u16::MAX as usize + 1 { + return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported")); + } + + let mut description_bytes = vec![0u8; size * 4]; + reader.read_exact(&mut description_bytes)?; + let mut description_bytes = &description_bytes[..]; + + if has_offsets { + let mut offsets = vec![0u8; size * 4]; + reader.read_exact(&mut offsets)?; + drop(offsets); // Not useful when deserializing into memory + } + + for _ in 0..size { + let key = description_bytes.read_u16::()?; + let len = u64::from(description_bytes.read_u16::()?) + 1; + + if len <= 4096 { + match self.containers.binary_search_by_key(&key, |c| c.key) { + Ok(loc) => { + let container = &mut self.containers[loc]; + + for _ in 0..len { + let mut value = [0u8; size_of::()]; + reader.read_exact(value.as_mut())?; + // TODO: since this is sorted it could probably be faster + let value = u16::from_le_bytes(value); + container.insert(value); + } + } + Err(loc) => { + let mut values = vec![0u16; len as usize]; + reader.read_exact(cast_slice_mut(&mut values))?; + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + + let array = ArrayStore::from_vec_unchecked(values); + let mut container = Container::new(key); + container.store = Store::Array(array); + self.containers.insert(loc, container); + } + } + } else { + match self.containers.binary_search_by_key(&key, |c| c.key) { + Ok(loc) => { + let current_store = std::mem::take(&mut self.containers[loc].store); + + let mut values = Box::new([0; 1024]); + reader.read_exact(cast_slice_mut(&mut values[..]))?; + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + + let mut store = BitmapStore::from_unchecked(len, values); + + match current_store { + Store::Array(array) => array.into_iter().for_each(|el| { + store.insert(el); + }), + Store::Bitmap(bitmap_store) => store |= &bitmap_store, + }; + + self.containers[loc].store = Store::Bitmap(store); + } + Err(loc) => { + let mut values = Box::new([0; 1024]); + reader.read_exact(cast_slice_mut(&mut values[..]))?; + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + + let array = BitmapStore::from_unchecked(len, values); + let mut container = Container::new(key); + container.store = Store::Bitmap(array); + self.containers.insert(loc, container); + } + } + }; + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use crate::RoaringBitmap; + use proptest::prelude::*; + + proptest! { + #[test] + fn prop_or_with_serialized( + mut a in RoaringBitmap::arbitrary(), + b in RoaringBitmap::arbitrary() + ) { + let union = &a | &b; + + let mut b_ser = Vec::new(); + b.serialize_into(&mut b_ser).unwrap(); + a.union_with_serialized(&*b_ser).unwrap(); + + prop_assert_eq!(a, union); + } + } + + #[test] + fn or_with_serialized() { + let unions = [ + (RoaringBitmap::new(), RoaringBitmap::new()), + (RoaringBitmap::from_sorted_iter([0]).unwrap(), RoaringBitmap::new()), + (RoaringBitmap::new(), RoaringBitmap::from_sorted_iter([0]).unwrap()), + ( + RoaringBitmap::from_sorted_iter([0]).unwrap(), + RoaringBitmap::from_sorted_iter([0]).unwrap(), + ), + ( + RoaringBitmap::from_sorted_iter([0]).unwrap(), + RoaringBitmap::from_sorted_iter([1]).unwrap(), + ), + ( + RoaringBitmap::from_sorted_iter([0]).unwrap(), + RoaringBitmap::from_sorted_iter(0..3000).unwrap(), + ), + ( + RoaringBitmap::from_sorted_iter([]).unwrap(), + RoaringBitmap::from_sorted_iter(0..3000).unwrap(), + ), + ( + RoaringBitmap::from_sorted_iter(0..3000).unwrap(), + RoaringBitmap::from_sorted_iter([3001]).unwrap(), + ), + ( + RoaringBitmap::from_sorted_iter(0..3000).unwrap(), + RoaringBitmap::from_sorted_iter(3000..6000).unwrap(), + ), + ]; + + for (mut a, b) in unions { + let union = &a | &b; + + let mut b_ser = Vec::new(); + b.serialize_into(&mut b_ser).unwrap(); + a.union_with_serialized(&*b_ser).unwrap(); + + assert_eq!(a, union, "When testing: {a:?} | {b:?}"); + } + } +}