From 9d1c8e72300338bc2e5068f0e9699af386caf8ce Mon Sep 17 00:00:00 2001 From: Melody Horn Date: Thu, 1 Apr 2021 11:55:57 -0600 Subject: ideally, but clearly not actually, implement NFA-to-DFA --- bird-machine-macros/src/nfa.rs | 175 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 169 insertions(+), 6 deletions(-) (limited to 'bird-machine-macros/src/nfa.rs') diff --git a/bird-machine-macros/src/nfa.rs b/bird-machine-macros/src/nfa.rs index b7b3612..833353e 100644 --- a/bird-machine-macros/src/nfa.rs +++ b/bird-machine-macros/src/nfa.rs @@ -1,15 +1,178 @@ -use std::collections::{HashMap, HashSet}; +use std::cmp; +use std::collections::{BTreeMap, BTreeSet, VecDeque}; +use std::iter::repeat_with; +use std::ops::RangeInclusive; +use regex_syntax::hir::{Class, Hir, HirKind, Literal, RepetitionKind, RepetitionRange}; + +#[derive(Eq, PartialEq, Clone)] +pub struct Range(pub RangeInclusive); + +impl PartialOrd for Range { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Range { + fn cmp(&self, other: &Self) -> cmp::Ordering { + let start_order = self.0.start().cmp(other.0.start()); + start_order.then_with(|| self.0.end().cmp(other.0.end())) + } +} + +/// A nondeterministic finite automaton. +/// By convention, always starts in state 0. #[derive(Default)] pub struct NFA { - state_count: usize, - transition_table: HashMap, HashSet>, + pub state_count: usize, + pub transition_table: BTreeMap, BTreeSet>>, + pub accept_states: BTreeSet, +} + +impl From<&Hir> for NFA { + fn from(hir: &Hir) -> Self { + match hir.kind() { + HirKind::Empty => { + // Conventionally, this matches only the empty string, + // but the regex crate has implicit `.*?` at the front and `.*` + // at the back of every regex. + // TODO figure out what to do with that + let mut result = NFA::default(); + result.state_count = 1; + result.accept_states.insert(0); + result + } + HirKind::Literal(literal) => match literal { + Literal::Unicode(ch) => { + let ch = *ch; + let mut result = NFA::default(); + result.state_count = 2; + result.accept_states.insert(1); + let range = Range(ch..=ch); + result + .transition_table + .entry(0) + .or_default() + .entry(Some(range)) + .or_default() + .insert(1); + result + } + l => todo!("{:?}", l), + }, + HirKind::Class(class) => match class { + Class::Unicode(class) => { + let mut result = NFA::default(); + result.state_count = 2; + result.accept_states.insert(1); + for range in class.iter() { + let range = Range(range.start()..=range.end()); + result + .transition_table + .entry(0) + .or_default() + .entry(Some(range)) + .or_default() + .insert(1); + } + result + } + c => todo!("{:?}", c), + }, + HirKind::Anchor(_) => { + // TODO actually fucking implement this + let mut result = NFA::default(); + result.state_count = 1; + result.accept_states.insert(0); + result + } + HirKind::Repetition(rep) => match &rep.kind { + RepetitionKind::Range(range) => match range { + RepetitionRange::Exactly(n) => repeat_with(|| NFA::from(rep.hir.as_ref())) + .take(*n as usize) + .reduce(NFA::concat) + .expect("exactly 0 of something"), + r => todo!("{:?}", r), + }, + k => todo!("{:?}", k), + }, + HirKind::Concat(elements) => elements + .iter() + .map(NFA::from) + .reduce(NFA::concat) + .expect("empty HirKind::Concat"), + k => todo!("{:?}", k), + } + } } impl NFA { - pub fn new_state(&mut self) -> usize { - let result = self.state_count; - self.state_count += 1; + fn concat(mut self, other: NFA) -> NFA { + // absorb the other automaton's states + let other_state_offset = self.state_count; + self.state_count += other.state_count; + self.transition_table + .extend( + other + .transition_table + .into_iter() + .map(|(state, table_for_state)| { + ( + state + other_state_offset, + table_for_state + .into_iter() + .map(|(char_range, next_states)| { + ( + char_range, + next_states + .into_iter() + .map(|s| s + other_state_offset) + .collect(), + ) + }) + .collect(), + ) + }), + ); + // for all the accept states of this machine... + for &self_accept in &self.accept_states { + // add an ε-transition from that state to the start of the other machine + self.transition_table + .entry(self_accept) + .or_default() + .entry(None) + .or_default() + .insert(other_state_offset); + } + // steal the other machine's accept states + self.accept_states = other.accept_states; + self + } + + pub fn epsilon_closure(&self, state: usize) -> BTreeSet { + let mut set = BTreeSet::new(); + set.insert(state); + self.epsilon_closure_all(set) + } + + pub fn epsilon_closure_all(&self, states: impl IntoIterator) -> BTreeSet { + let mut result = BTreeSet::new(); + let mut queue: VecDeque = VecDeque::new(); + queue.extend(states); + while let Some(next_state) = queue.pop_front() { + result.insert(next_state); + let states_after_that = self + .transition_table + .get(&next_state) + .and_then(|state_table| state_table.get(&None)) + .map(|next_states| next_states.iter()) + .into_iter() + .flatten() + .copied() + .filter(|s| !result.contains(s)); + queue.extend(states_after_that); + } result } } -- cgit v1.2.3