aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMelody Horn <melody@boringcactus.com>2021-04-01 11:55:57 -0600
committerMelody Horn <melody@boringcactus.com>2021-04-01 11:55:57 -0600
commit9d1c8e72300338bc2e5068f0e9699af386caf8ce (patch)
treef206ca6c9a3a0c2bb0cf5aaeda9bc4291dbcbfa0
parent24207feb7726bd2db97693eb8fdd155d33612574 (diff)
downloadbird-machine-9d1c8e72300338bc2e5068f0e9699af386caf8ce.tar.gz
bird-machine-9d1c8e72300338bc2e5068f0e9699af386caf8ce.zip
ideally, but clearly not actually, implement NFA-to-DFA
-rw-r--r--bird-machine-macros/src/dfa.rs156
-rw-r--r--bird-machine-macros/src/lib.rs34
-rw-r--r--bird-machine-macros/src/nfa.rs175
3 files changed, 345 insertions, 20 deletions
diff --git a/bird-machine-macros/src/dfa.rs b/bird-machine-macros/src/dfa.rs
new file mode 100644
index 0000000..0aec059
--- /dev/null
+++ b/bird-machine-macros/src/dfa.rs
@@ -0,0 +1,156 @@
+use std::collections::{BTreeMap, BTreeSet, VecDeque};
+
+use proc_macro2::TokenStream;
+use quote::{quote, ToTokens};
+
+use super::nfa::{Range, NFA};
+
+/// A nondeterministic finite automaton.
+/// By convention, always starts in state 0.
+pub struct DFA {
+ transition_table: BTreeMap<usize, BTreeMap<Range, usize>>,
+ accept_states: BTreeSet<usize>,
+}
+
+struct LabeledDFA<Label: Ord> {
+ start_state: Label,
+ transition_table: BTreeMap<Label, BTreeMap<Range, Label>>,
+ accept_states: BTreeSet<Label>,
+}
+
+impl From<NFA> for LabeledDFA<BTreeSet<usize>> {
+ fn from(nfa: NFA) -> Self {
+ type StateSet = BTreeSet<usize>;
+
+ let mut transition_table: BTreeMap<StateSet, BTreeMap<Range, StateSet>> = BTreeMap::new();
+
+ let mut queue: VecDeque<StateSet> = VecDeque::new();
+
+ let start_state = nfa.epsilon_closure(0);
+ queue.push_back(start_state.clone());
+
+ while let Some(state_set) = queue.pop_front() {
+ let state_set = nfa.epsilon_closure_all(state_set);
+
+ let mut next_table: BTreeMap<Range, StateSet> = BTreeMap::new();
+ for &state in &state_set {
+ let table_entry = nfa.transition_table.get(&state);
+ let non_epsilon_table_entries = table_entry
+ .iter()
+ .flat_map(|x| x.iter())
+ .filter_map(|(c, s)| c.as_ref().map(|c| (c.clone(), s)));
+ for (c, next_states) in non_epsilon_table_entries {
+ // TODO check for non-disjoint ranges
+ next_table
+ .entry(c)
+ .or_default()
+ .extend(nfa.epsilon_closure_all(next_states.iter().copied()))
+ }
+ }
+
+ for states in next_table.values() {
+ if !transition_table.contains_key(states) {
+ queue.push_back(states.clone());
+ }
+ }
+ transition_table.insert(state_set, next_table);
+ }
+
+ let accept_states = transition_table
+ .keys()
+ .chain(transition_table.values().flat_map(|v| v.values()))
+ .filter(|s| s.iter().any(|s| nfa.accept_states.contains(s)))
+ .cloned()
+ .collect();
+
+ dbg!(&accept_states);
+
+ Self {
+ start_state,
+ transition_table,
+ accept_states,
+ }
+ }
+}
+
+impl<Label: Ord> From<LabeledDFA<Label>> for DFA {
+ fn from(dfa: LabeledDFA<Label>) -> Self {
+ let mut state_names = BTreeMap::new();
+ state_names.insert(&dfa.start_state, 0usize);
+ let mut next_name = 1usize;
+
+ for reachable_state in dfa.transition_table.values().flat_map(|x| x.values()) {
+ state_names.entry(reachable_state).or_insert_with(|| {
+ let result = next_name;
+ next_name = next_name.saturating_add(1);
+ result
+ });
+ }
+
+ let transition_table = dfa
+ .transition_table
+ .iter()
+ .map(|(state, table)| {
+ (
+ state_names[&state],
+ table
+ .iter()
+ .map(|(c, s)| (c.clone(), state_names[&s]))
+ .collect(),
+ )
+ })
+ .collect();
+ let accept_states = state_names
+ .iter()
+ .filter(|(s, _)| dfa.accept_states.contains(s))
+ .map(|(_, n)| n)
+ .copied()
+ .collect();
+
+ Self {
+ transition_table,
+ accept_states,
+ }
+ }
+}
+
+impl From<NFA> for DFA {
+ fn from(a: NFA) -> Self {
+ let intermediate = LabeledDFA::from(a);
+ Self::from(intermediate)
+ }
+}
+
+impl ToTokens for Range {
+ fn to_tokens(&self, tokens: &mut TokenStream) {
+ let start = self.0.start();
+ let end = self.0.end();
+ let result = quote!(#start..=#end);
+ result.to_tokens(tokens);
+ }
+}
+
+impl ToTokens for DFA {
+ fn to_tokens(&self, tokens: &mut TokenStream) {
+ let transition = self
+ .transition_table
+ .iter()
+ .flat_map(|(current_state, table)| {
+ table
+ .iter()
+ .map(move |(ch, next_state)| quote!((#current_state, #ch) => #next_state,))
+ });
+ let accepts = self.accept_states.iter();
+ let result = quote! {
+ let mut state = 0usize;
+ for c in input.chars() {
+ state = match (state, c) {
+ #(#transition)*
+ _ => return false,
+ };
+ }
+ matches!(state, #(#accepts)|*)
+ };
+ result.to_tokens(tokens);
+ }
+}
diff --git a/bird-machine-macros/src/lib.rs b/bird-machine-macros/src/lib.rs
index 628ee91..c6df41c 100644
--- a/bird-machine-macros/src/lib.rs
+++ b/bird-machine-macros/src/lib.rs
@@ -1,9 +1,11 @@
extern crate proc_macro;
use proc_macro::TokenStream;
+use proc_macro2::Ident;
+use quote::{format_ident, quote, ToTokens};
use syn::{parse_macro_input, DeriveInput, LitStr};
-use quote::{quote, ToTokens};
+mod dfa;
mod nfa;
#[proc_macro_attribute]
@@ -14,13 +16,14 @@ pub fn bird_machine(args: TokenStream, input: TokenStream) -> TokenStream {
let input_type_name = &input.ident;
let input_lifetimes: Vec<_> = input.generics.lifetimes().collect();
let lifetime = match input_lifetimes.as_slice() {
- [] => quote!{ 'unrestricted },
- [lt] => quote!{ #lt },
+ [] => quote! { 'unrestricted },
+ [lt] => quote! { #lt },
_ => panic!("multiple lifetime generics, what is this, pls to halp"),
};
- let machine = build_machine(&input_regex);
- dbg!(&machine);
+ let machine_fn = format_ident!("bird_machine_for_{}", input_type_name);
+ let machine = build_machine(&input_regex, &machine_fn);
+ eprintln!("{}", &machine);
let (_, ty_generics, where_clause) = input.generics.split_for_impl();
@@ -59,7 +62,7 @@ pub fn bird_machine(args: TokenStream, input: TokenStream) -> TokenStream {
};
let is_match = quote! {
fn is_match(text: &#lifetime str) -> bool {
- todo!()
+ #machine_fn(text)
}
};
let is_match_at = quote! {
@@ -162,18 +165,21 @@ pub fn bird_machine(args: TokenStream, input: TokenStream) -> TokenStream {
tokens.into()
}
-fn build_machine(regex: &LitStr) -> proc_macro2::TokenStream {
+fn build_machine(regex: &LitStr, fn_name: &Ident) -> proc_macro2::TokenStream {
let regex_text = regex.value();
- let regex_ir = regex_syntax::Parser::new()
- .parse(&regex_text);
+ let regex_ir = regex_syntax::Parser::new().parse(&regex_text);
let regex_ir = match regex_ir {
Ok(x) => x,
- Err(err ) => panic!("error compiling regex {}: {}", regex.to_token_stream(), err),
+ Err(err) => panic!("error compiling regex {}: {}", regex.to_token_stream(), err),
};
- dbg!(&regex_ir);
- // shout out to all the professors who've taught me how to do this
- let mut built_nfa = nfa::NFA::default();
+ let built_nfa = nfa::NFA::from(&regex_ir);
+ dbg!("built the NFA");
+ let dfa = dfa::DFA::from(built_nfa);
- todo!()
+ quote! {
+ fn #fn_name(input: &str) -> bool {
+ #dfa
+ }
+ }
}
diff --git a/bird-machine-macros/src/nfa.rs b/bird-machine-macros/src/nfa.rs
index b7b3612..833353e 100644
--- a/bird-machine-macros/src/nfa.rs
+++ b/bird-machine-macros/src/nfa.rs
@@ -1,15 +1,178 @@
-use std::collections::{HashMap, HashSet};
+use std::cmp;
+use std::collections::{BTreeMap, BTreeSet, VecDeque};
+use std::iter::repeat_with;
+use std::ops::RangeInclusive;
+use regex_syntax::hir::{Class, Hir, HirKind, Literal, RepetitionKind, RepetitionRange};
+
+#[derive(Eq, PartialEq, Clone)]
+pub struct Range(pub RangeInclusive<char>);
+
+impl PartialOrd for Range {
+ fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl Ord for Range {
+ fn cmp(&self, other: &Self) -> cmp::Ordering {
+ let start_order = self.0.start().cmp(other.0.start());
+ start_order.then_with(|| self.0.end().cmp(other.0.end()))
+ }
+}
+
+/// A nondeterministic finite automaton.
+/// By convention, always starts in state 0.
#[derive(Default)]
pub struct NFA {
- state_count: usize,
- transition_table: HashMap<Option<char>, HashSet<usize>>,
+ pub state_count: usize,
+ pub transition_table: BTreeMap<usize, BTreeMap<Option<Range>, BTreeSet<usize>>>,
+ pub accept_states: BTreeSet<usize>,
+}
+
+impl From<&Hir> for NFA {
+ fn from(hir: &Hir) -> Self {
+ match hir.kind() {
+ HirKind::Empty => {
+ // Conventionally, this matches only the empty string,
+ // but the regex crate has implicit `.*?` at the front and `.*`
+ // at the back of every regex.
+ // TODO figure out what to do with that
+ let mut result = NFA::default();
+ result.state_count = 1;
+ result.accept_states.insert(0);
+ result
+ }
+ HirKind::Literal(literal) => match literal {
+ Literal::Unicode(ch) => {
+ let ch = *ch;
+ let mut result = NFA::default();
+ result.state_count = 2;
+ result.accept_states.insert(1);
+ let range = Range(ch..=ch);
+ result
+ .transition_table
+ .entry(0)
+ .or_default()
+ .entry(Some(range))
+ .or_default()
+ .insert(1);
+ result
+ }
+ l => todo!("{:?}", l),
+ },
+ HirKind::Class(class) => match class {
+ Class::Unicode(class) => {
+ let mut result = NFA::default();
+ result.state_count = 2;
+ result.accept_states.insert(1);
+ for range in class.iter() {
+ let range = Range(range.start()..=range.end());
+ result
+ .transition_table
+ .entry(0)
+ .or_default()
+ .entry(Some(range))
+ .or_default()
+ .insert(1);
+ }
+ result
+ }
+ c => todo!("{:?}", c),
+ },
+ HirKind::Anchor(_) => {
+ // TODO actually fucking implement this
+ let mut result = NFA::default();
+ result.state_count = 1;
+ result.accept_states.insert(0);
+ result
+ }
+ HirKind::Repetition(rep) => match &rep.kind {
+ RepetitionKind::Range(range) => match range {
+ RepetitionRange::Exactly(n) => repeat_with(|| NFA::from(rep.hir.as_ref()))
+ .take(*n as usize)
+ .reduce(NFA::concat)
+ .expect("exactly 0 of something"),
+ r => todo!("{:?}", r),
+ },
+ k => todo!("{:?}", k),
+ },
+ HirKind::Concat(elements) => elements
+ .iter()
+ .map(NFA::from)
+ .reduce(NFA::concat)
+ .expect("empty HirKind::Concat"),
+ k => todo!("{:?}", k),
+ }
+ }
}
impl NFA {
- pub fn new_state(&mut self) -> usize {
- let result = self.state_count;
- self.state_count += 1;
+ fn concat(mut self, other: NFA) -> NFA {
+ // absorb the other automaton's states
+ let other_state_offset = self.state_count;
+ self.state_count += other.state_count;
+ self.transition_table
+ .extend(
+ other
+ .transition_table
+ .into_iter()
+ .map(|(state, table_for_state)| {
+ (
+ state + other_state_offset,
+ table_for_state
+ .into_iter()
+ .map(|(char_range, next_states)| {
+ (
+ char_range,
+ next_states
+ .into_iter()
+ .map(|s| s + other_state_offset)
+ .collect(),
+ )
+ })
+ .collect(),
+ )
+ }),
+ );
+ // for all the accept states of this machine...
+ for &self_accept in &self.accept_states {
+ // add an ε-transition from that state to the start of the other machine
+ self.transition_table
+ .entry(self_accept)
+ .or_default()
+ .entry(None)
+ .or_default()
+ .insert(other_state_offset);
+ }
+ // steal the other machine's accept states
+ self.accept_states = other.accept_states;
+ self
+ }
+
+ pub fn epsilon_closure(&self, state: usize) -> BTreeSet<usize> {
+ let mut set = BTreeSet::new();
+ set.insert(state);
+ self.epsilon_closure_all(set)
+ }
+
+ pub fn epsilon_closure_all(&self, states: impl IntoIterator<Item = usize>) -> BTreeSet<usize> {
+ let mut result = BTreeSet::new();
+ let mut queue: VecDeque<usize> = VecDeque::new();
+ queue.extend(states);
+ while let Some(next_state) = queue.pop_front() {
+ result.insert(next_state);
+ let states_after_that = self
+ .transition_table
+ .get(&next_state)
+ .and_then(|state_table| state_table.get(&None))
+ .map(|next_states| next_states.iter())
+ .into_iter()
+ .flatten()
+ .copied()
+ .filter(|s| !result.contains(s));
+ queue.extend(states_after_that);
+ }
result
}
}