use std::cmp; use std::collections::{BTreeMap, BTreeSet, VecDeque}; use std::iter::repeat_with; use std::ops::RangeInclusive; use regex_syntax::hir::{Class, Hir, HirKind, Literal, RepetitionKind, RepetitionRange}; #[derive(Eq, PartialEq, Clone)] pub struct Range(pub RangeInclusive); impl PartialOrd for Range { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl Ord for Range { fn cmp(&self, other: &Self) -> cmp::Ordering { let start_order = self.0.start().cmp(other.0.start()); start_order.then_with(|| self.0.end().cmp(other.0.end())) } } /// A nondeterministic finite automaton. /// By convention, always starts in state 0. #[derive(Default)] pub struct NFA { pub state_count: usize, pub transition_table: BTreeMap, BTreeSet>>, pub accept_states: BTreeSet, } impl From<&Hir> for NFA { fn from(hir: &Hir) -> Self { match hir.kind() { HirKind::Empty => { // Conventionally, this matches only the empty string, // but the regex crate has implicit `.*?` at the front and `.*` // at the back of every regex. // TODO figure out what to do with that let mut result = NFA::default(); result.state_count = 1; result.accept_states.insert(0); result } HirKind::Literal(literal) => match literal { Literal::Unicode(ch) => { let ch = *ch; let mut result = NFA::default(); result.state_count = 2; result.accept_states.insert(1); let range = Range(ch..=ch); result .transition_table .entry(0) .or_default() .entry(Some(range)) .or_default() .insert(1); result } l => todo!("{:?}", l), }, HirKind::Class(class) => match class { Class::Unicode(class) => { let mut result = NFA::default(); result.state_count = 2; result.accept_states.insert(1); for range in class.iter() { let range = Range(range.start()..=range.end()); result .transition_table .entry(0) .or_default() .entry(Some(range)) .or_default() .insert(1); } result } c => todo!("{:?}", c), }, HirKind::Anchor(_) => { // TODO actually fucking implement this let mut result = NFA::default(); result.state_count = 1; result.accept_states.insert(0); result } HirKind::Repetition(rep) => match &rep.kind { RepetitionKind::Range(range) => match range { RepetitionRange::Exactly(n) => repeat_with(|| NFA::from(rep.hir.as_ref())) .take(*n as usize) .reduce(NFA::concat) .expect("exactly 0 of something"), r => todo!("{:?}", r), }, k => todo!("{:?}", k), }, HirKind::Concat(elements) => elements .iter() .map(NFA::from) .reduce(NFA::concat) .expect("empty HirKind::Concat"), k => todo!("{:?}", k), } } } impl NFA { fn concat(mut self, other: NFA) -> NFA { // absorb the other automaton's states let other_state_offset = self.state_count; self.state_count += other.state_count; self.transition_table .extend( other .transition_table .into_iter() .map(|(state, table_for_state)| { ( state + other_state_offset, table_for_state .into_iter() .map(|(char_range, next_states)| { ( char_range, next_states .into_iter() .map(|s| s + other_state_offset) .collect(), ) }) .collect(), ) }), ); // for all the accept states of this machine... for &self_accept in &self.accept_states { // add an ε-transition from that state to the start of the other machine self.transition_table .entry(self_accept) .or_default() .entry(None) .or_default() .insert(other_state_offset); } // steal the other machine's accept states self.accept_states = other.accept_states; self } pub fn epsilon_closure(&self, state: usize) -> BTreeSet { let mut set = BTreeSet::new(); set.insert(state); self.epsilon_closure_all(set) } pub fn epsilon_closure_all(&self, states: impl IntoIterator) -> BTreeSet { let mut result = BTreeSet::new(); let mut queue: VecDeque = VecDeque::new(); queue.extend(states); while let Some(next_state) = queue.pop_front() { result.insert(next_state); let states_after_that = self .transition_table .get(&next_state) .and_then(|state_table| state_table.get(&None)) .map(|next_states| next_states.iter()) .into_iter() .flatten() .copied() .filter(|s| !result.contains(s)); queue.extend(states_after_that); } result } }