diff --git a/Cargo.lock b/Cargo.lock index ad8a5b4..8f841c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,74 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "general-sam" version = "0.2.0" +dependencies = [ + "rand", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "libc" +version = "0.2.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" diff --git a/Cargo.toml b/Cargo.toml index 96d5f6c..2bb03a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,6 @@ exclude = ["release-plz.toml", "cliff.tolm"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] name = "general_sam" + +[dev-dependencies] +rand = "0.8.5" diff --git a/README.md b/README.md index cae909a..0982873 100644 --- a/README.md +++ b/README.md @@ -42,10 +42,10 @@ use general_sam::sam::GeneralSAM; let sam = GeneralSAM::construct_from_bytes("abcbc"); // => GeneralSAM -// "cbc" is a suffix. +// "cbc" is a suffix of "abcbc" assert!(sam.get_root_state().feed_bytes("cbc").is_accepting()); -// "bcb" isn't a suffix. +// "bcb" is not a suffix of "abcbc" assert!(!sam.get_root_state().feed_bytes("bcb").is_accepting()); ``` @@ -57,19 +57,19 @@ let sam = GeneralSAM::construct_from_chars("abcbc".chars()); let state = sam.get_root_state(); -// "b" is not a suffix but a substring. +// "b" is not a suffix but at least a substring of "abcbc" let state = state.feed_chars("b"); assert!(!state.is_accepting()); -// "bc" is a suffix. +// "bc" is a suffix of "abcbc" let state = state.feed_chars("c"); assert!(state.is_accepting()); -// "bcbc" is also a suffix. +// "bcbc" is a suffix of "abcbc" let state = state.feed_chars("bc"); assert!(state.is_accepting()); -// "bcbcbc" is not a substring. +// "bcbcbc" is not a substring, much less a suffix of "abcbc" let state = state.feed_chars("bc"); assert!(!state.is_accepting() && state.is_nil()); ``` diff --git a/src/lib.rs b/src/lib.rs index 8fef252..8f91480 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,28 @@ //! This crate provides an implementation of a general suffix automaton. //! -//! | [![the suffix automaton of abcbc][sam-of-abcbc]][sam-oi-wiki] | -//! | :----------------------------------------------------------------------------: | -//! 
| The suffix automaton of abcbc, image from [后缀自动机 - OI Wiki][sam-oi-wiki]. | +//! ```mermaid +//! flowchart LR +//! init((ε)) +//! a((a)) +//! b((b)) +//! ab((ab)) +//! bc(((bc))) +//! abc((abc)) +//! abcb((abcb)) +//! abcbc(((abcbc))) +//! +//! init -- a --> a +//! init -- b --> b +//! a -- b --> ab +//! b -- c --> bc +//! init -- c --> bc +//! ab -- c --> abc +//! bc -- b --> abcb +//! abc -- b --> abcb +//! abcb -- c --> abcbc +//! ``` //! -//! [sam-of-abcbc]: https://oi-wiki.org/string/images/SAM/SA_suffix_links.svg -//! [sam-oi-wiki]: https://oi-wiki.org/string/sam/ +//! > The suffix automaton of abcbc. //! //! # Examples //! @@ -15,7 +32,10 @@ //! let sam = GeneralSAM::construct_from_bytes("abcbc"); //! // => GeneralSAM //! +//! // "cbc" is a suffix of "abcbc" //! assert!(sam.get_root_state().feed_bytes("cbc").is_accepting()); +//! +//! // "bcb" is not a suffix of "abcbc" //! assert!(!sam.get_root_state().feed_bytes("bcb").is_accepting()); //! ``` //! @@ -26,12 +46,20 @@ //! // => GeneralSAM //! //! let state = sam.get_root_state(); +//! +//! // "b" is not a suffix but at least a substring of "abcbc" //! let state = state.feed_chars("b"); //! assert!(!state.is_accepting()); +//! +//! // "bc" is a suffix of "abcbc" //! let state = state.feed_chars("c"); //! assert!(state.is_accepting()); +//! +//! // "bcbc" is a suffix of "abcbc" //! let state = state.feed_chars("bc"); //! assert!(state.is_accepting()); +//! +//! // "bcbcbc" is not a substring, much less a suffix of "abcbc" //! let state = state.feed_chars("bc"); //! assert!(!state.is_accepting() && state.is_nil()); //! 
``` diff --git a/src/sam/mod.rs b/src/sam/mod.rs index 30b1ce9..881879a 100644 --- a/src/sam/mod.rs +++ b/src/sam/mod.rs @@ -23,7 +23,7 @@ pub struct GeneralSAMNode { #[derive(Debug, Clone)] pub struct GeneralSAM { node_pool: Vec>, - topo_order: Vec, + topo_and_suf_len_sorted_order: Vec, } impl GeneralSAMNode { @@ -81,7 +81,7 @@ impl Default for GeneralSAM { GeneralSAMNode::new(false, 0, SAM_NIL_NODE_ID), GeneralSAMNode::new(true, 0, SAM_NIL_NODE_ID), ], - topo_order: Default::default(), + topo_and_suf_len_sorted_order: Default::default(), } } } @@ -91,6 +91,14 @@ impl GeneralSAM { self.node_pool.len() } + pub fn get_root_node(&self) -> &GeneralSAMNode { + self.get_node(SAM_ROOT_NODE_ID).unwrap() + } + + pub fn get_node(&self, node_id: GeneralSAMNodeID) -> Option<&GeneralSAMNode> { + self.node_pool.get(node_id) + } + pub fn get_root_state(&self) -> GeneralSAMState { self.get_state(SAM_ROOT_NODE_ID) } @@ -106,8 +114,11 @@ impl GeneralSAM { } } - pub fn get_topo_sorted_node_ids(&self) -> &Vec { - &self.topo_order + /// Returns the node ids in an order that is simultaneously topologically sorted, + /// sorted by maximum suffix length, and sorted by suffix parent depth; + /// the order is produced by a queue-based topological sort. 
+ pub fn get_topo_and_suf_len_sorted_node_ids(&self) -> &Vec { + &self.topo_and_suf_len_sorted_order } pub fn construct_from_trie(node: TN) -> Self @@ -119,7 +130,7 @@ impl GeneralSAM { let accept_empty_string = node.is_accepting(); sam.build_with_trie(node); - sam.topo_sort(); + sam.topo_sort_with_queue(); sam.update_accepting(); sam.node_pool[SAM_ROOT_NODE_ID].accept = accept_empty_string; @@ -151,9 +162,9 @@ impl GeneralSAM { .unwrap(); } - fn topo_sort(&mut self) { - let mut in_degree: Vec = Vec::new(); - in_degree.resize(self.node_pool.len(), 0); + fn topo_sort_with_queue(&mut self) { + let mut in_degree: Vec = vec![0; self.num_of_nodes()]; + self.node_pool.iter().for_each(|node| { node.trans.values().for_each(|v| { in_degree[*v] += 1; @@ -161,27 +172,31 @@ impl GeneralSAM { }); assert!(in_degree[SAM_ROOT_NODE_ID] == 0); - self.topo_order.reserve(self.node_pool.len()); + self.topo_and_suf_len_sorted_order + .reserve(self.node_pool.len()); - self.topo_order.push(SAM_ROOT_NODE_ID); + self.topo_and_suf_len_sorted_order.push(SAM_ROOT_NODE_ID); let mut head = 0; - while head < self.topo_order.len() { - let u_id = self.topo_order[head]; + while head < self.topo_and_suf_len_sorted_order.len() { + let u_id = self.topo_and_suf_len_sorted_order[head]; head += 1; self.node_pool[u_id].trans.values().for_each(|v_id| { in_degree[*v_id] -= 1; if in_degree[*v_id] == 0 { - self.topo_order.push(*v_id); + self.topo_and_suf_len_sorted_order.push(*v_id); } }); } } fn update_accepting(&mut self) { - self.topo_order.iter().rev().for_each(|node_id| { - let link_id = self.node_pool[*node_id].link; - self.node_pool[link_id].accept |= self.node_pool[*node_id].accept; - }); + self.topo_and_suf_len_sorted_order + .iter() + .rev() + .for_each(|node_id| { + let link_id = self.node_pool[*node_id].link; + self.node_pool[link_id].accept |= self.node_pool[*node_id].accept; + }); self.node_pool[SAM_NIL_NODE_ID].accept = false; } diff --git a/src/sam/state.rs b/src/sam/state.rs index 
7708613..d9fbaad 100644 --- a/src/sam/state.rs +++ b/src/sam/state.rs @@ -37,8 +37,8 @@ impl<'s, T: Ord + Clone> GeneralSAMState<'s, T> { .unwrap_or(false) } - pub fn get_node(&self) -> Option<&'_ GeneralSAMNode> { - self.sam.node_pool.get(self.node_id) + pub fn get_node(&self) -> Option<&GeneralSAMNode> { + self.sam.get_node(self.node_id) } pub fn goto_suffix_parent(&mut self) { diff --git a/src/tests.rs b/src/tests.rs index fbeab67..8aa703d 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,4 +1,10 @@ -use crate::{sam::GeneralSAM, trie::Trie}; +use rand::{ + distributions::{Alphanumeric, DistString}, + rngs::StdRng, + Rng, SeedableRng, +}; + +use crate::{sam::GeneralSAM, trie::Trie, SAM_ROOT_NODE_ID}; #[test] fn test_example_from_chars() { @@ -165,3 +171,53 @@ fn test_simple_trie_suffix() { let vocab = ["ac", "bb", "b", "cc", "aabb", "a", "ba", "c", "aa"]; test_trie_suffix(&vocab); } + +#[test] +fn test_topo_and_suf_len_sorted_order() { + let mut rng = StdRng::seed_from_u64(1134759173975); + for _ in 0..10000 { + let mut trie = Trie::default(); + for _ in 0..rng.gen_range(0..32) { + let len = rng.gen_range(0..9); + let string = Alphanumeric.sample_string(&mut rng, len); + trie.insert_ref_iter(string.as_bytes().iter()); + } + + let sam: GeneralSAM = GeneralSAM::construct_from_trie(trie.get_root_state()); + + let order = sam.get_topo_and_suf_len_sorted_node_ids(); + let rank = { + let mut rank = vec![0; sam.num_of_nodes()]; + order.iter().enumerate().for_each(|(k, i)| { + rank[*i] = k; + }); + rank + }; + + // verify that max suffix lengths should be sorted + for pos in 0..order.len() - 1 { + assert!( + sam.get_node(order[pos]).unwrap().max_suffix_len() + <= sam.get_node(order[pos + 1]).unwrap().max_suffix_len() + ); + } + + // verify topological ordering + order.iter().for_each(|node_id| { + let node = sam.get_node(*node_id).unwrap(); + + node.get_trans().values().for_each(|next_node_id| { + assert!(rank[*next_node_id] > rank[*node_id]); + }); + }); + + // verify 
suffix parent tree depth ordering + order.iter().for_each(|node_id| { + let node = sam.get_node(*node_id).unwrap(); + + if *node_id != SAM_ROOT_NODE_ID { + assert!(rank[node.get_suffix_parent_id()] < rank[*node_id]); + } + }); + } +} diff --git a/src/trie.rs b/src/trie.rs index d84269e..f1c531a 100644 --- a/src/trie.rs +++ b/src/trie.rs @@ -45,7 +45,10 @@ impl TrieNode { impl Default for Trie { fn default() -> Self { Self { - node_pool: vec![TrieNode::new(TRIE_NIL_NODE_ID), TrieNode::new(TRIE_NIL_NODE_ID)], + node_pool: vec![ + TrieNode::new(TRIE_NIL_NODE_ID), + TrieNode::new(TRIE_NIL_NODE_ID), + ], } } } @@ -86,7 +89,10 @@ impl Trie { node_id } - pub fn insert_ref_iter<'s, Iter: Iterator>(&'s mut self, iter: Iter) -> TrieNodeID { + pub fn insert_ref_iter<'s, Iter: Iterator>( + &'s mut self, + iter: Iter, + ) -> TrieNodeID { self.insert_iter(iter.cloned()) }