aboutsummaryrefslogtreecommitdiff
path: root/src/tokens.rs
diff options
context:
space:
mode:
authorAlex Crichton <alex@alexcrichton.com>2017-01-29 16:53:20 -0800
committerAlex Crichton <alex@alexcrichton.com>2017-02-08 21:21:18 -0800
commitf66d8bcf33530c858a502bfa170f2383a8cbc204 (patch)
tree76498b837fc5f1f6ba0a5f53e1b2d85c6638da4d /src/tokens.rs
parent473908c9722eeedeec1777237a135f582faa78d8 (diff)
downloadmilf-rs-f66d8bcf33530c858a502bfa170f2383a8cbc204.tar.gz
milf-rs-f66d8bcf33530c858a502bfa170f2383a8cbc204.zip
Rewrite crate with serde support from ground up
This commit completely rewrites this crate from the ground up, supporting serde at the lowest levels as I believe serde support was intended to do. This is a major change from the previous versions of this crate, with a summary of changes being: * Serialization directly to TOML is now supported without going through a `Value` first. * Deserialization directly from TOML is now supported without going through a `Value`. Note that due to the TOML format some values still are buffered in intermediate memory, but overall this should be at a minimum now. * The API of `Value` was overhauled to match the API of `serde_json::Value`. The changes here were to: * Add `is_*` accessors * Add `get` and `get_mut` for one-field lookups. * Implement panicking lookups through `Index` The old `index` methods are now gone in favor of `get` and `Index` implementations. * A `Datetime` type has been added to represent a TOML datetime in a first-class fashion. Currently this type provides no accessors other than a `Display` implementation, but the idea is that this will grow support over time for decomposing the date. * Support for the `rustc-serialize` crate has been dropped, that'll stay on the 0.2 and 0.1 release trains. * This crate no longer supports the detection of unused fields, for that though you can use the `serde_ignored` crate on crates.io
Diffstat (limited to 'src/tokens.rs')
-rw-r--r--src/tokens.rs620
1 files changed, 620 insertions, 0 deletions
diff --git a/src/tokens.rs b/src/tokens.rs
new file mode 100644
index 0000000..26c8277
--- /dev/null
+++ b/src/tokens.rs
@@ -0,0 +1,620 @@
+use std::borrow::Cow;
+use std::char;
+use std::str;
+use std::string;
+
+use self::Token::*;
+
+#[derive(Eq, PartialEq, Debug)]
+pub enum Token<'a> {
+ Whitespace(&'a str),
+ Newline,
+ Comment(&'a str),
+
+ Equals,
+ Period,
+ Comma,
+ Colon,
+ Plus,
+ LeftBrace,
+ RightBrace,
+ LeftBracket,
+ RightBracket,
+
+ Keylike(&'a str),
+ String { src: &'a str, val: Cow<'a, str> },
+}
+
+#[derive(Eq, PartialEq, Debug)]
+pub enum Error {
+ InvalidCharInString(usize, char),
+ InvalidEscape(usize, char),
+ InvalidHexEscape(usize, char),
+ InvalidEscapeValue(usize, u32),
+ NewlineInString(usize),
+ Unexpected(usize, char),
+ UnterminatedString(usize),
+ NewlineInTableKey(usize),
+ EmptyTableKey(usize),
+ Wanted { at: usize, expected: &'static str, found: &'static str },
+}
+
+#[derive(Clone)]
+pub struct Tokenizer<'a> {
+ input: &'a str,
+ chars: CrlfFold<'a>,
+}
+
+#[derive(Clone)]
+struct CrlfFold<'a> {
+ chars: str::CharIndices<'a>,
+}
+
+#[derive(Debug)]
+enum MaybeString {
+ NotEscaped(usize),
+ Owned(string::String),
+}
+
+impl<'a> Tokenizer<'a> {
+ pub fn new(input: &'a str) -> Tokenizer<'a> {
+ let mut t = Tokenizer {
+ input: input,
+ chars: CrlfFold {
+ chars: input.char_indices(),
+ },
+ };
+ // Eat utf-8 BOM
+ t.eatc('\u{feff}');
+ return t
+ }
+
+ pub fn next(&mut self) -> Result<Option<Token<'a>>, Error> {
+ let token = match self.chars.next() {
+ Some((_, '\n')) => Newline,
+ Some((start, ' ')) => self.whitespace_token(start),
+ Some((start, '\t')) => self.whitespace_token(start),
+ Some((start, '#')) => self.comment_token(start),
+ Some((_, '=')) => Equals,
+ Some((_, '.')) => Period,
+ Some((_, ',')) => Comma,
+ Some((_, ':')) => Colon,
+ Some((_, '+')) => Plus,
+ Some((_, '{')) => LeftBrace,
+ Some((_, '}')) => RightBrace,
+ Some((_, '[')) => LeftBracket,
+ Some((_, ']')) => RightBracket,
+ Some((start, '\'')) => return self.literal_string(start).map(Some),
+ Some((start, '"')) => return self.basic_string(start).map(Some),
+ Some((start, ch)) if is_keylike(ch) => self.keylike(start),
+
+ Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
+ None => return Ok(None),
+ };
+ Ok(Some(token))
+ }
+
+ pub fn peek(&mut self) -> Result<Option<Token<'a>>, Error> {
+ self.clone().next()
+ }
+
+ pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
+ match self.peek()? {
+ Some(ref found) if expected == *found => {}
+ Some(_) => return Ok(false),
+ None => return Ok(false),
+ }
+ drop(self.next());
+ Ok(true)
+ }
+
+ pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
+ let current = self.current();
+ match self.next()? {
+ Some(found) => {
+ if expected == found {
+ Ok(())
+ } else {
+ Err(Error::Wanted {
+ at: current,
+ expected: expected.describe(),
+ found: found.describe(),
+ })
+ }
+ }
+ None => {
+ Err(Error::Wanted {
+ at: self.input.len(),
+ expected: expected.describe(),
+ found: "eof",
+ })
+ }
+ }
+ }
+
+ pub fn table_key(&mut self) -> Result<Cow<'a, str>, Error> {
+ let current = self.current();
+ match self.next()? {
+ Some(Token::Keylike(k)) => Ok(k.into()),
+ Some(Token::String { src, val }) => {
+ let offset = self.substr_offset(src);
+ if val == "" {
+ return Err(Error::EmptyTableKey(offset))
+ }
+ match src.find("\n") {
+ None => Ok(val),
+ Some(i) => Err(Error::NewlineInTableKey(offset + i)),
+ }
+ }
+ Some(other) => {
+ Err(Error::Wanted {
+ at: current,
+ expected: "a table key",
+ found: other.describe(),
+ })
+ }
+ None => {
+ Err(Error::Wanted {
+ at: self.input.len(),
+ expected: "a table key",
+ found: "eof",
+ })
+ }
+ }
+ }
+
+ pub fn eat_whitespace(&mut self) -> Result<(), Error> {
+ while self.eatc(' ') || self.eatc('\t') {
+ // ...
+ }
+ Ok(())
+ }
+
+ pub fn eat_comment(&mut self) -> Result<bool, Error> {
+ if !self.eatc('#') {
+ return Ok(false)
+ }
+ drop(self.comment_token(0));
+ self.eat_newline_or_eof().map(|()| true)
+ }
+
+ pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
+ let current = self.current();
+ match self.next()? {
+ None |
+ Some(Token::Newline) => Ok(()),
+ Some(other) => {
+ Err(Error::Wanted {
+ at: current,
+ expected: "newline",
+ found: other.describe(),
+ })
+ }
+ }
+ }
+
+ pub fn skip_to_newline(&mut self) {
+ loop {
+ match self.chars.next() {
+ Some((_, '\n')) |
+ None => break,
+ _ => {}
+ }
+ }
+ }
+
+ fn eatc(&mut self, ch: char) -> bool {
+ match self.chars.clone().next() {
+ Some((_, ch2)) if ch == ch2 => {
+ self.chars.next();
+ true
+ }
+ _ => false,
+ }
+ }
+
+ pub fn current(&mut self) -> usize {
+ self.chars.clone().next().map(|i| i.0).unwrap_or(self.input.len())
+ }
+
+ pub fn input(&self) -> &'a str {
+ self.input
+ }
+
+ fn whitespace_token(&mut self, start: usize) -> Token<'a> {
+ while self.eatc(' ') || self.eatc('\t') {
+ // ...
+ }
+ Whitespace(&self.input[start..self.current()])
+ }
+
+ fn comment_token(&mut self, start: usize) -> Token<'a> {
+ while let Some((_, ch)) = self.chars.clone().next() {
+ if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') {
+ break
+ }
+ self.chars.next();
+ }
+ Comment(&self.input[start..self.current()])
+ }
+
+ fn read_string(&mut self,
+ delim: char,
+ start: usize,
+ new_ch: &mut FnMut(&mut Tokenizer, &mut MaybeString,
+ bool, usize, char)
+ -> Result<(), Error>)
+ -> Result<Token<'a>, Error> {
+ let mut multiline = false;
+ if self.eatc(delim) {
+ if self.eatc(delim) {
+ multiline = true;
+ } else {
+ return Ok(String {
+ src: &self.input[start..start+2],
+ val: Cow::Borrowed(""),
+ })
+ }
+ }
+ let mut val = MaybeString::NotEscaped(self.current());
+ let mut n = 0;
+ 'outer: loop {
+ n += 1;
+ match self.chars.next() {
+ Some((i, '\n')) => {
+ if multiline {
+ if self.input.as_bytes()[i] == b'\r' {
+ val.to_owned(&self.input[..i]);
+ }
+ if n == 1 {
+ val = MaybeString::NotEscaped(self.current());
+ } else {
+ val.push('\n');
+ }
+ continue
+ } else {
+ return Err(Error::NewlineInString(i))
+ }
+ }
+ Some((i, ch)) if ch == delim => {
+ if multiline {
+ for _ in 0..2 {
+ if !self.eatc(delim) {
+ val.push(delim);
+ continue 'outer
+ }
+ }
+ }
+ return Ok(String {
+ src: &self.input[start..self.current()],
+ val: val.into_cow(&self.input[..i]),
+ })
+ }
+ Some((i, c)) => try!(new_ch(self, &mut val, multiline, i, c)),
+ None => return Err(Error::UnterminatedString(start))
+ }
+ }
+ }
+
+ fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
+ self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
+ if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}') {
+ val.push(ch);
+ Ok(())
+ } else {
+ Err(Error::InvalidCharInString(i, ch))
+ }
+ })
+ }
+
+ fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> {
+ self.read_string('"', start, &mut |me, val, multi, i, ch| {
+ match ch {
+ '\\' => {
+ val.to_owned(&me.input[..i]);
+ match me.chars.next() {
+ Some((_, '"')) => val.push('"'),
+ Some((_, '\\')) => val.push('\\'),
+ Some((_, 'b')) => val.push('\u{8}'),
+ Some((_, 'f')) => val.push('\u{c}'),
+ Some((_, 'n')) => val.push('\n'),
+ Some((_, 'r')) => val.push('\r'),
+ Some((_, 't')) => val.push('\t'),
+ Some((i, c @ 'u')) |
+ Some((i, c @ 'U')) => {
+ let len = if c == 'u' {4} else {8};
+ val.push(try!(me.hex(start, i, len)));
+ }
+ Some((_, '\n')) if multi => {
+ while let Some((_, ch)) = me.chars.clone().next() {
+ match ch {
+ ' ' | '\t' | '\n' => {
+ me.chars.next();
+ }
+ _ => break,
+ }
+ }
+ }
+ Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
+ None => return Err(Error::UnterminatedString(start)),
+ }
+ Ok(())
+ }
+ ch if '\u{20}' <= ch && ch <= '\u{10ffff}' => {
+ val.push(ch);
+ Ok(())
+ }
+ _ => Err(Error::InvalidCharInString(i, ch))
+ }
+ })
+ }
+
+ fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> {
+ let mut val = 0;
+ for _ in 0..len {
+ match self.chars.next() {
+ Some((_, ch)) if '0' <= ch && ch <= '9' => {
+ val = val * 16 + (ch as u32 - '0' as u32);
+ }
+ Some((_, ch)) if 'A' <= ch && ch <= 'F' => {
+ val = val * 16 + (ch as u32 - 'A' as u32) + 10;
+ }
+ Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
+ None => return Err(Error::UnterminatedString(start)),
+ }
+ }
+ match char::from_u32(val) {
+ Some(ch) => Ok(ch),
+ None => Err(Error::InvalidEscapeValue(i, val)),
+ }
+ }
+
+ fn keylike(&mut self, start: usize) -> Token<'a> {
+ while let Some((_, ch)) = self.chars.clone().next() {
+ if !is_keylike(ch) {
+ break
+ }
+ self.chars.next();
+ }
+ Keylike(&self.input[start..self.current()])
+ }
+
+ pub fn substr_offset(&self, s: &'a str) -> usize {
+ assert!(s.len() < self.input.len());
+ let a = self.input.as_ptr() as usize;
+ let b = s.as_ptr() as usize;
+ assert!(a <= b);
+ b - a
+ }
+}
+
+impl<'a> Iterator for CrlfFold<'a> {
+ type Item = (usize, char);
+
+ fn next(&mut self) -> Option<(usize, char)> {
+ self.chars.next().map(|(i, c)| {
+ if c == '\r' {
+ let mut attempt = self.chars.clone();
+ if let Some((_, '\n')) = attempt.next() {
+ self.chars = attempt;
+ return (i, '\n')
+ }
+ }
+ (i, c)
+ })
+ }
+}
+
+impl MaybeString {
+ fn push(&mut self, ch: char) {
+ match *self {
+ MaybeString::NotEscaped(..) => {}
+ MaybeString::Owned(ref mut s) => s.push(ch),
+ }
+ }
+
+ fn to_owned(&mut self, input: &str) {
+ match *self {
+ MaybeString::NotEscaped(start) => {
+ *self = MaybeString::Owned(input[start..].to_owned());
+ }
+ MaybeString::Owned(..) => {}
+ }
+ }
+
+ fn into_cow<'a>(self, input: &'a str) -> Cow<'a, str> {
+ match self {
+ MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
+ MaybeString::Owned(s) => Cow::Owned(s),
+ }
+ }
+}
+
+fn is_keylike(ch: char) -> bool {
+ ('A' <= ch && ch <= 'Z') ||
+ ('a' <= ch && ch <= 'z') ||
+ ('0' <= ch && ch <= '9') ||
+ ch == '-' ||
+ ch == '_'
+}
+
+impl<'a> Token<'a> {
+ pub fn describe(&self) -> &'static str {
+ match *self {
+ Token::Keylike(_) => "an identifier",
+ Token::Equals => "an equals",
+ Token::Period => "a period",
+ Token::Comment(_) => "a comment",
+ Token::Newline => "a newline",
+ Token::Whitespace(_) => "whitespace",
+ Token::Comma => "a comma",
+ Token::RightBrace => "a right brace",
+ Token::LeftBrace => "a left brace",
+ Token::RightBracket => "a right bracket",
+ Token::LeftBracket => "a left bracket",
+ Token::String { .. } => "a string",
+ Token::Colon => "a colon",
+ Token::Plus => "a plus",
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::borrow::Cow;
+ use super::{Tokenizer, Token, Error};
+
+ fn err(input: &str, err: Error) {
+ let mut t = Tokenizer::new(input);
+ let token = t.next().unwrap_err();
+ assert_eq!(token, err);
+ assert!(t.next().unwrap().is_none());
+ }
+
+ #[test]
+ fn literal_strings() {
+ fn t(input: &str, val: &str) {
+ let mut t = Tokenizer::new(input);
+ let token = t.next().unwrap().unwrap();
+ assert_eq!(token, Token::String {
+ src: input,
+ val: Cow::Borrowed(val),
+ });
+ assert!(t.next().unwrap().is_none());
+ }
+
+ t("''", "");
+ t("''''''", "");
+ t("'''\n'''", "");
+ t("'a'", "a");
+ t("'\"a'", "\"a");
+ t("''''a'''", "'a");
+ t("'''\n'a\n'''", "'a\n");
+ t("'''a\n'a\r\n'''", "a\n'a\n");
+ }
+
+ #[test]
+ fn basic_strings() {
+ fn t(input: &str, val: &str) {
+ let mut t = Tokenizer::new(input);
+ let token = t.next().unwrap().unwrap();
+ assert_eq!(token, Token::String {
+ src: input,
+ val: Cow::Borrowed(val),
+ });
+ assert!(t.next().unwrap().is_none());
+ }
+
+ t(r#""""#, "");
+ t(r#""""""""#, "");
+ t(r#""a""#, "a");
+ t(r#""""a""""#, "a");
+ t(r#""\t""#, "\t");
+ t(r#""\u0000""#, "\0");
+ t(r#""\U00000000""#, "\0");
+ t(r#""\U000A0000""#, "\u{A0000}");
+ t(r#""\\t""#, "\\t");
+ t("\"\"\"\\\n\"\"\"", "");
+ t("\"\"\"\\\n \t \t \\\r\n \t \n \t \r\n\"\"\"", "");
+ t(r#""\r""#, "\r");
+ t(r#""\n""#, "\n");
+ t(r#""\b""#, "\u{8}");
+ t(r#""a\fa""#, "a\u{c}a");
+ t(r#""\"a""#, "\"a");
+ t("\"\"\"\na\"\"\"", "a");
+ t("\"\"\"\n\"\"\"", "");
+ err(r#""\a"#, Error::InvalidEscape(2, 'a'));
+ err("\"\\\n", Error::InvalidEscape(2, '\n'));
+ err("\"\\\r\n", Error::InvalidEscape(2, '\n'));
+ err("\"\\", Error::UnterminatedString(0));
+ err("\"\u{0}", Error::InvalidCharInString(1, '\u{0}'));
+ err(r#""\U00""#, Error::InvalidHexEscape(5, '"'));
+ err(r#""\U00"#, Error::UnterminatedString(0));
+ err(r#""\uD800"#, Error::InvalidEscapeValue(2, 0xd800));
+ err(r#""\UFFFFFFFF"#, Error::InvalidEscapeValue(2, 0xffffffff));
+ }
+
+ #[test]
+ fn keylike() {
+ fn t(input: &str) {
+ let mut t = Tokenizer::new(input);
+ let token = t.next().unwrap().unwrap();
+ assert_eq!(token, Token::Keylike(input));
+ assert!(t.next().unwrap().is_none());
+ }
+ t("foo");
+ t("0bar");
+ t("bar0");
+ t("1234");
+ t("a-b");
+ t("a_B");
+ t("-_-");
+ t("___");
+ }
+
+ #[test]
+ fn all() {
+ fn t(input: &str, expected: &[Token]) {
+ let mut tokens = Tokenizer::new(input);
+ let mut actual = Vec::new();
+ while let Some(token) = tokens.next().unwrap() {
+ actual.push(token);
+ }
+ for (a, b) in actual.iter().zip(expected) {
+ assert_eq!(a, b);
+ }
+ assert_eq!(actual.len(), expected.len());
+ }
+
+ t(" a ", &[
+ Token::Whitespace(" "),
+ Token::Keylike("a"),
+ Token::Whitespace(" "),
+ ]);
+
+ t(" a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ", &[
+ Token::Whitespace(" "),
+ Token::Keylike("a"),
+ Token::Whitespace("\t "),
+ Token::LeftBracket,
+ Token::LeftBracket,
+ Token::RightBracket,
+ Token::RightBracket,
+ Token::Whitespace(" \t "),
+ Token::LeftBracket,
+ Token::RightBracket,
+ Token::Whitespace(" "),
+ Token::LeftBrace,
+ Token::RightBrace,
+ Token::Whitespace(" "),
+ Token::Comma,
+ Token::Whitespace(" "),
+ Token::Period,
+ Token::Whitespace(" "),
+ Token::Equals,
+ Token::Newline,
+ Token::Comment("# foo "),
+ Token::Newline,
+ Token::Comment("#foo "),
+ Token::Newline,
+ Token::Whitespace(" "),
+ ]);
+ }
+
+ #[test]
+ fn bare_cr_bad() {
+ err("\r", Error::Unexpected(0, '\r'));
+ err("'\n", Error::NewlineInString(1));
+ err("'\u{0}", Error::InvalidCharInString(1, '\u{0}'));
+ err("'", Error::UnterminatedString(0));
+ err("\u{0}", Error::Unexpected(0, '\u{0}'));
+ }
+
+ #[test]
+ fn bad_comment() {
+ let mut t = Tokenizer::new("#\u{0}");
+ t.next().unwrap().unwrap();
+ assert_eq!(t.next(), Err(Error::Unexpected(1, '\u{0}')));
+ assert!(t.next().unwrap().is_none());
+ }
+}