aboutsummaryrefslogtreecommitdiff
path: root/bird-machine-macros/src/nfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'bird-machine-macros/src/nfa.rs')
-rw-r--r--bird-machine-macros/src/nfa.rs175
1 files changed, 169 insertions, 6 deletions
diff --git a/bird-machine-macros/src/nfa.rs b/bird-machine-macros/src/nfa.rs
index b7b3612..833353e 100644
--- a/bird-machine-macros/src/nfa.rs
+++ b/bird-machine-macros/src/nfa.rs
@@ -1,15 +1,178 @@
-use std::collections::{HashMap, HashSet};
+use std::cmp;
+use std::collections::{BTreeMap, BTreeSet, VecDeque};
+use std::iter::repeat_with;
+use std::ops::RangeInclusive;
+use regex_syntax::hir::{Class, Hir, HirKind, Literal, RepetitionKind, RepetitionRange};
+
+#[derive(Eq, PartialEq, Clone)]
+pub struct Range(pub RangeInclusive<char>);
+
+impl PartialOrd for Range {
+ fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl Ord for Range {
+ fn cmp(&self, other: &Self) -> cmp::Ordering {
+ let start_order = self.0.start().cmp(other.0.start());
+ start_order.then_with(|| self.0.end().cmp(other.0.end()))
+ }
+}
+
+/// A nondeterministic finite automaton.
+/// By convention, always starts in state 0.
#[derive(Default)]
pub struct NFA {
- state_count: usize,
- transition_table: HashMap<Option<char>, HashSet<usize>>,
+ pub state_count: usize,
+ pub transition_table: BTreeMap<usize, BTreeMap<Option<Range>, BTreeSet<usize>>>,
+ pub accept_states: BTreeSet<usize>,
+}
+
+impl From<&Hir> for NFA {
+ fn from(hir: &Hir) -> Self {
+ match hir.kind() {
+ HirKind::Empty => {
+ // Conventionally, this matches only the empty string,
+ // but the regex crate has implicit `.*?` at the front and `.*`
+ // at the back of every regex.
+ // TODO figure out what to do with that
+ let mut result = NFA::default();
+ result.state_count = 1;
+ result.accept_states.insert(0);
+ result
+ }
+ HirKind::Literal(literal) => match literal {
+ Literal::Unicode(ch) => {
+ let ch = *ch;
+ let mut result = NFA::default();
+ result.state_count = 2;
+ result.accept_states.insert(1);
+ let range = Range(ch..=ch);
+ result
+ .transition_table
+ .entry(0)
+ .or_default()
+ .entry(Some(range))
+ .or_default()
+ .insert(1);
+ result
+ }
+ l => todo!("{:?}", l),
+ },
+ HirKind::Class(class) => match class {
+ Class::Unicode(class) => {
+ let mut result = NFA::default();
+ result.state_count = 2;
+ result.accept_states.insert(1);
+ for range in class.iter() {
+ let range = Range(range.start()..=range.end());
+ result
+ .transition_table
+ .entry(0)
+ .or_default()
+ .entry(Some(range))
+ .or_default()
+ .insert(1);
+ }
+ result
+ }
+ c => todo!("{:?}", c),
+ },
+ HirKind::Anchor(_) => {
+ // TODO actually fucking implement this
+ let mut result = NFA::default();
+ result.state_count = 1;
+ result.accept_states.insert(0);
+ result
+ }
+ HirKind::Repetition(rep) => match &rep.kind {
+ RepetitionKind::Range(range) => match range {
+ RepetitionRange::Exactly(n) => repeat_with(|| NFA::from(rep.hir.as_ref()))
+ .take(*n as usize)
+ .reduce(NFA::concat)
+ .expect("exactly 0 of something"),
+ r => todo!("{:?}", r),
+ },
+ k => todo!("{:?}", k),
+ },
+ HirKind::Concat(elements) => elements
+ .iter()
+ .map(NFA::from)
+ .reduce(NFA::concat)
+ .expect("empty HirKind::Concat"),
+ k => todo!("{:?}", k),
+ }
+ }
}
impl NFA {
- pub fn new_state(&mut self) -> usize {
- let result = self.state_count;
- self.state_count += 1;
+ fn concat(mut self, other: NFA) -> NFA {
+ // absorb the other automaton's states
+ let other_state_offset = self.state_count;
+ self.state_count += other.state_count;
+ self.transition_table
+ .extend(
+ other
+ .transition_table
+ .into_iter()
+ .map(|(state, table_for_state)| {
+ (
+ state + other_state_offset,
+ table_for_state
+ .into_iter()
+ .map(|(char_range, next_states)| {
+ (
+ char_range,
+ next_states
+ .into_iter()
+ .map(|s| s + other_state_offset)
+ .collect(),
+ )
+ })
+ .collect(),
+ )
+ }),
+ );
+ // for all the accept states of this machine...
+ for &self_accept in &self.accept_states {
+ // add an ε-transition from that state to the start of the other machine
+ self.transition_table
+ .entry(self_accept)
+ .or_default()
+ .entry(None)
+ .or_default()
+ .insert(other_state_offset);
+ }
+ // steal the other machine's accept states
+ self.accept_states = other.accept_states;
+ self
+ }
+
+ pub fn epsilon_closure(&self, state: usize) -> BTreeSet<usize> {
+ let mut set = BTreeSet::new();
+ set.insert(state);
+ self.epsilon_closure_all(set)
+ }
+
+ pub fn epsilon_closure_all(&self, states: impl IntoIterator<Item = usize>) -> BTreeSet<usize> {
+ let mut result = BTreeSet::new();
+ let mut queue: VecDeque<usize> = VecDeque::new();
+ queue.extend(states);
+ while let Some(next_state) = queue.pop_front() {
+ result.insert(next_state);
+ let states_after_that = self
+ .transition_table
+ .get(&next_state)
+ .and_then(|state_table| state_table.get(&None))
+ .map(|next_states| next_states.iter())
+ .into_iter()
+ .flatten()
+ .copied()
+ .filter(|s| !result.contains(s));
+ queue.extend(states_after_that);
+ }
result
}
}