diff options
author | Melody Horn <melody@boringcactus.com> | 2021-04-01 11:55:57 -0600 |
---|---|---|
committer | Melody Horn <melody@boringcactus.com> | 2021-04-01 11:55:57 -0600 |
commit | 9d1c8e72300338bc2e5068f0e9699af386caf8ce (patch) | |
tree | f206ca6c9a3a0c2bb0cf5aaeda9bc4291dbcbfa0 /bird-machine-macros | |
parent | 24207feb7726bd2db97693eb8fdd155d33612574 (diff) | |
download | bird-machine-9d1c8e72300338bc2e5068f0e9699af386caf8ce.tar.gz bird-machine-9d1c8e72300338bc2e5068f0e9699af386caf8ce.zip |
ideally, but clearly not actually, implement NFA-to-DFA
Diffstat (limited to 'bird-machine-macros')
-rw-r--r-- | bird-machine-macros/src/dfa.rs | 156 | ||||
-rw-r--r-- | bird-machine-macros/src/lib.rs | 34 | ||||
-rw-r--r-- | bird-machine-macros/src/nfa.rs | 175 |
3 files changed, 345 insertions, 20 deletions
diff --git a/bird-machine-macros/src/dfa.rs b/bird-machine-macros/src/dfa.rs new file mode 100644 index 0000000..0aec059 --- /dev/null +++ b/bird-machine-macros/src/dfa.rs @@ -0,0 +1,156 @@ +use std::collections::{BTreeMap, BTreeSet, VecDeque}; + +use proc_macro2::TokenStream; +use quote::{quote, ToTokens}; + +use super::nfa::{Range, NFA}; + +/// A nondeterministic finite automaton. +/// By convention, always starts in state 0. +pub struct DFA { + transition_table: BTreeMap<usize, BTreeMap<Range, usize>>, + accept_states: BTreeSet<usize>, +} + +struct LabeledDFA<Label: Ord> { + start_state: Label, + transition_table: BTreeMap<Label, BTreeMap<Range, Label>>, + accept_states: BTreeSet<Label>, +} + +impl From<NFA> for LabeledDFA<BTreeSet<usize>> { + fn from(nfa: NFA) -> Self { + type StateSet = BTreeSet<usize>; + + let mut transition_table: BTreeMap<StateSet, BTreeMap<Range, StateSet>> = BTreeMap::new(); + + let mut queue: VecDeque<StateSet> = VecDeque::new(); + + let start_state = nfa.epsilon_closure(0); + queue.push_back(start_state.clone()); + + while let Some(state_set) = queue.pop_front() { + let state_set = nfa.epsilon_closure_all(state_set); + + let mut next_table: BTreeMap<Range, StateSet> = BTreeMap::new(); + for &state in &state_set { + let table_entry = nfa.transition_table.get(&state); + let non_epsilon_table_entries = table_entry + .iter() + .flat_map(|x| x.iter()) + .filter_map(|(c, s)| c.as_ref().map(|c| (c.clone(), s))); + for (c, next_states) in non_epsilon_table_entries { + // TODO check for non-disjoint ranges + next_table + .entry(c) + .or_default() + .extend(nfa.epsilon_closure_all(next_states.iter().copied())) + } + } + + for states in next_table.values() { + if !transition_table.contains_key(states) { + queue.push_back(states.clone()); + } + } + transition_table.insert(state_set, next_table); + } + + let accept_states = transition_table + .keys() + .chain(transition_table.values().flat_map(|v| v.values())) + .filter(|s| s.iter().any(|s| nfa.accept_states.contains(s))) + .cloned() + .collect(); + + dbg!(&accept_states); + + Self { + start_state, + transition_table, + accept_states, + } + } +} + +impl<Label: Ord> From<LabeledDFA<Label>> for DFA { + fn from(dfa: LabeledDFA<Label>) -> Self { + let mut state_names = BTreeMap::new(); + state_names.insert(&dfa.start_state, 0usize); + let mut next_name = 1usize; + + for reachable_state in dfa.transition_table.values().flat_map(|x| x.values()) { + state_names.entry(reachable_state).or_insert_with(|| { + let result = next_name; + next_name = next_name.saturating_add(1); + result + }); + } + + let transition_table = dfa + .transition_table + .iter() + .map(|(state, table)| { + ( + state_names[&state], + table + .iter() + .map(|(c, s)| (c.clone(), state_names[&s])) + .collect(), + ) + }) + .collect(); + let accept_states = state_names + .iter() + .filter(|(s, _)| dfa.accept_states.contains(s)) + .map(|(_, n)| n) + .copied() + .collect(); + + Self { + transition_table, + accept_states, + } + } +} + +impl From<NFA> for DFA { + fn from(a: NFA) -> Self { + let intermediate = LabeledDFA::from(a); + Self::from(intermediate) + } +} + +impl ToTokens for Range { + fn to_tokens(&self, tokens: &mut TokenStream) { + let start = self.0.start(); + let end = self.0.end(); + let result = quote!(#start..=#end); + result.to_tokens(tokens); + } +} + +impl ToTokens for DFA { + fn to_tokens(&self, tokens: &mut TokenStream) { + let transition = self + .transition_table + .iter() + .flat_map(|(current_state, table)| { + table + .iter() + .map(move |(ch, next_state)| quote!((#current_state, #ch) => #next_state,)) + }); + let accepts = self.accept_states.iter(); + let result = quote! { + let mut state = 0usize; + for c in input.chars() { + state = match (state, c) { + #(#transition)* + _ => return false, + }; + } + matches!(state, #(#accepts)|*) + }; + result.to_tokens(tokens); + } +} diff --git a/bird-machine-macros/src/lib.rs b/bird-machine-macros/src/lib.rs index 628ee91..c6df41c 100644 --- a/bird-machine-macros/src/lib.rs +++ b/bird-machine-macros/src/lib.rs @@ -1,9 +1,11 @@ extern crate proc_macro; use proc_macro::TokenStream; +use proc_macro2::Ident; +use quote::{format_ident, quote, ToTokens}; use syn::{parse_macro_input, DeriveInput, LitStr}; -use quote::{quote, ToTokens}; +mod dfa; mod nfa; #[proc_macro_attribute] @@ -14,13 +16,14 @@ pub fn bird_machine(args: TokenStream, input: TokenStream) -> TokenStream { let input_type_name = &input.ident; let input_lifetimes: Vec<_> = input.generics.lifetimes().collect(); let lifetime = match input_lifetimes.as_slice() { - [] => quote!{ 'unrestricted }, - [lt] => quote!{ #lt }, + [] => quote! { 'unrestricted }, + [lt] => quote! { #lt }, _ => panic!("multiple lifetime generics, what is this, pls to halp"), }; - let machine = build_machine(&input_regex); - dbg!(&machine); + let machine_fn = format_ident!("bird_machine_for_{}", input_type_name); + let machine = build_machine(&input_regex, &machine_fn); + eprintln!("{}", &machine); let (_, ty_generics, where_clause) = input.generics.split_for_impl(); @@ -59,7 +62,7 @@ pub fn bird_machine(args: TokenStream, input: TokenStream) -> TokenStream { }; let is_match = quote! { fn is_match(text: &#lifetime str) -> bool { - todo!() + #machine_fn(text) } }; let is_match_at = quote! { @@ -162,18 +165,21 @@ pub fn bird_machine(args: TokenStream, input: TokenStream) -> TokenStream { tokens.into() } -fn build_machine(regex: &LitStr) -> proc_macro2::TokenStream { +fn build_machine(regex: &LitStr, fn_name: &Ident) -> proc_macro2::TokenStream { let regex_text = regex.value(); - let regex_ir = regex_syntax::Parser::new() - .parse(®ex_text); + let regex_ir = regex_syntax::Parser::new().parse(®ex_text); let regex_ir = match regex_ir { Ok(x) => x, - Err(err ) => panic!("error compiling regex {}: {}", regex.to_token_stream(), err), + Err(err) => panic!("error compiling regex {}: {}", regex.to_token_stream(), err), }; - dbg!(®ex_ir); - // shout out to all the professors who've taught me how to do this - let mut built_nfa = nfa::NFA::default(); + let built_nfa = nfa::NFA::from(®ex_ir); + dbg!("built the NFA"); + let dfa = dfa::DFA::from(built_nfa); - todo!() + quote! { + fn #fn_name(input: &str) -> bool { + #dfa + } + } } diff --git a/bird-machine-macros/src/nfa.rs b/bird-machine-macros/src/nfa.rs index b7b3612..833353e 100644 --- a/bird-machine-macros/src/nfa.rs +++ b/bird-machine-macros/src/nfa.rs @@ -1,15 +1,178 @@ -use std::collections::{HashMap, HashSet}; +use std::cmp; +use std::collections::{BTreeMap, BTreeSet, VecDeque}; +use std::iter::repeat_with; +use std::ops::RangeInclusive; +use regex_syntax::hir::{Class, Hir, HirKind, Literal, RepetitionKind, RepetitionRange}; + +#[derive(Eq, PartialEq, Clone)] +pub struct Range(pub RangeInclusive<char>); + +impl PartialOrd for Range { + fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for Range { + fn cmp(&self, other: &Self) -> cmp::Ordering { + let start_order = self.0.start().cmp(other.0.start()); + start_order.then_with(|| self.0.end().cmp(other.0.end())) + } +} + +/// A nondeterministic finite automaton. +/// By convention, always starts in state 0. #[derive(Default)] pub struct NFA { - state_count: usize, - transition_table: HashMap<Option<char>, HashSet<usize>>, + pub state_count: usize, + pub transition_table: BTreeMap<usize, BTreeMap<Option<Range>, BTreeSet<usize>>>, + pub accept_states: BTreeSet<usize>, +} + +impl From<&Hir> for NFA { + fn from(hir: &Hir) -> Self { + match hir.kind() { + HirKind::Empty => { + // Conventionally, this matches only the empty string, + // but the regex crate has implicit `.*?` at the front and `.*` + // at the back of every regex. + // TODO figure out what to do with that + let mut result = NFA::default(); + result.state_count = 1; + result.accept_states.insert(0); + result + } + HirKind::Literal(literal) => match literal { + Literal::Unicode(ch) => { + let ch = *ch; + let mut result = NFA::default(); + result.state_count = 2; + result.accept_states.insert(1); + let range = Range(ch..=ch); + result + .transition_table + .entry(0) + .or_default() + .entry(Some(range)) + .or_default() + .insert(1); + result + } + l => todo!("{:?}", l), + }, + HirKind::Class(class) => match class { + Class::Unicode(class) => { + let mut result = NFA::default(); + result.state_count = 2; + result.accept_states.insert(1); + for range in class.iter() { + let range = Range(range.start()..=range.end()); + result + .transition_table + .entry(0) + .or_default() + .entry(Some(range)) + .or_default() + .insert(1); + } + result + } + c => todo!("{:?}", c), + }, + HirKind::Anchor(_) => { + // TODO actually fucking implement this + let mut result = NFA::default(); + result.state_count = 1; + result.accept_states.insert(0); + result + } + HirKind::Repetition(rep) => match &rep.kind { + RepetitionKind::Range(range) => match range { + RepetitionRange::Exactly(n) => repeat_with(|| NFA::from(rep.hir.as_ref())) + .take(*n as usize) + .reduce(NFA::concat) + .expect("exactly 0 of something"), + r => todo!("{:?}", r), + }, + k => todo!("{:?}", k), + }, + HirKind::Concat(elements) => elements + .iter() + .map(NFA::from) + .reduce(NFA::concat) + .expect("empty HirKind::Concat"), + k => todo!("{:?}", k), + } + } } impl NFA { - pub fn new_state(&mut self) -> usize { - let result = self.state_count; - self.state_count += 1; + fn concat(mut self, other: NFA) -> NFA { + // absorb the other automaton's states + let other_state_offset = self.state_count; + self.state_count += other.state_count; + self.transition_table + .extend( + other + .transition_table + .into_iter() + .map(|(state, table_for_state)| { + ( + state + other_state_offset, + table_for_state + .into_iter() + .map(|(char_range, next_states)| { + ( + char_range, + next_states + .into_iter() + .map(|s| s + other_state_offset) + .collect(), + ) + }) + .collect(), + ) + }), + ); + // for all the accept states of this machine... + for &self_accept in &self.accept_states { + // add an ε-transition from that state to the start of the other machine + self.transition_table + .entry(self_accept) + .or_default() + .entry(None) + .or_default() + .insert(other_state_offset); + } + // steal the other machine's accept states + self.accept_states = other.accept_states; + self + } + + pub fn epsilon_closure(&self, state: usize) -> BTreeSet<usize> { + let mut set = BTreeSet::new(); + set.insert(state); + self.epsilon_closure_all(set) + } + + pub fn epsilon_closure_all(&self, states: impl IntoIterator<Item = usize>) -> BTreeSet<usize> { + let mut result = BTreeSet::new(); + let mut queue: VecDeque<usize> = VecDeque::new(); + queue.extend(states); + while let Some(next_state) = queue.pop_front() { + result.insert(next_state); + let states_after_that = self + .transition_table + .get(&next_state) + .and_then(|state_table| state_table.get(&None)) + .map(|next_states| next_states.iter()) + .into_iter() + .flatten() + .copied() + .filter(|s| !result.contains(s)); + queue.extend(states_after_that); + } result } } |