// sui_sql_macro/parser.rs

// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

use std::borrow::Cow;

use crate::lexer::Lexeme;
use crate::lexer::Token;

/// Recursive descent parser for format strings. This struct is intended to be lightweight to
/// support efficient backtracking. Backtracking is implemented by operating on a copy of the
/// parser's state, and only updating the original when the copy has reached a known good position.
#[derive(Copy, Clone)]
pub(crate) struct Parser<'l, 's> {
    // The tokens still to be consumed, in order. `'l` is the lifetime of the lexeme slice,
    // and `'s` the lifetime of the source string the lexemes borrow their text from.
    lexemes: &'l [Lexeme<'s>],
}

/// Structured representation of a format string, starting with a text prefix (the head), followed
/// by interleaved binders followed by text suffixes (the tail).
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct Format<'s> {
    // Literal text before the first binder (empty if the format string starts with a binder).
    pub head: Cow<'s, str>,
    // One entry per binder: its optional type annotation, paired with the (possibly empty)
    // literal text that follows it.
    pub tail: Vec<(Option<syn::Type>, Cow<'s, str>)>,
}

/// Errors produced while parsing a format string into a `Format`.
#[derive(thiserror::Error, Debug)]
pub(crate) enum Error {
    // A token other than the expected one was found at `offset`.
    #[error("Unexpected {actual} at offset {offset}, expected {expect}")]
    Unexpected {
        offset: usize,
        actual: Token,
        expect: Token,
    },

    // The token stream ended while another token was still expected.
    #[error("Unexpected end of format string, expected: {expect:?}")]
    UnexpectedEos { expect: Token },

    // The contents of a binder could not be parsed as a Rust type by `syn`.
    #[error("Error parsing type for binder at offset {offset}: {source}")]
    TypeParse { offset: usize, source: syn::Error },
}

41/// Recursive descent parser recognizing the following grammar:
42///
43///   format        ::= text? (bind text?)*
44///   text          ::= part +
45///   part          ::= lcurly_escape | rcurly_escape | TEXT
46///   bind          ::= '{' TEXT? '}'
47///   lcurly_escape ::= '{' '{'
48///   rcurly_escape ::= '}' '}'
49///
50/// The parser reads tokens from the front, consuming them when it is able to successfully parse a
51/// non-terminal. When parsing fails, the parser may be left in an indeterminate state.
52/// Backtracking requires explicitly copying the parser state.
53impl<'l, 's> Parser<'l, 's> {
54    /// Constructs a new parser instance that will consume the given `lexemes`.
55    pub(crate) fn new(lexemes: &'l [Lexeme<'s>]) -> Self {
56        Self { lexemes }
57    }
58
59    /// Entrypoint to the parser. Consumes the entire remaining output (and therefore also the
60    /// parser).
61    pub(crate) fn format(mut self) -> Result<Format<'s>, Error> {
62        let head = self.text_opt();
63
64        let mut tail = vec![];
65        while let Some(bind) = self.bind()? {
66            let suffix = self.text_opt();
67            tail.push((bind, suffix));
68        }
69
70        Ok(Format { head, tail })
71    }
72
73    /// Parse a strand of text. The parser is left in its initial state and an empty string is
74    /// returned if there was no text to parse.
75    fn text_opt(&mut self) -> Cow<'s, str> {
76        let mut copy = *self;
77        copy.text().map_or(Cow::Borrowed(""), |t| {
78            *self = copy;
79            t
80        })
81    }
82
83    /// Parse a strand of text by gathering together as many `part`s as possible. Errors if there
84    /// is not at least one `part` to consume.
85    fn text(&mut self) -> Result<Cow<'s, str>, Error> {
86        let mut text = self.part()?;
87
88        let mut copy = *self;
89        while let Ok(part) = copy.part() {
90            text += part;
91            *self = copy;
92        }
93
94        Ok(text)
95    }
96
97    /// Parses any of the three possible `part`s of a text strand: a string of text, or an escaped
98    /// curly brace. Errors if the token stream starts with a binder.
99    fn part(&mut self) -> Result<Cow<'s, str>, Error> {
100        let mut copy = *self;
101        if let Ok(s) = copy.lcurly_escape() {
102            *self = copy;
103            return Ok(s);
104        }
105
106        let mut copy = *self;
107        if let Ok(s) = copy.rcurly_escape() {
108            *self = copy;
109            return Ok(s);
110        }
111
112        let Lexeme(_, _, text) = self.eat(Token::Text)?;
113        Ok(Cow::Borrowed(text))
114    }
115
116    /// Parses as an escaped left curly brace (two curly brace tokens).
117    fn lcurly_escape(&mut self) -> Result<Cow<'s, str>, Error> {
118        use Token as T;
119        self.eat(T::LCurl)?;
120        self.eat(T::LCurl)?;
121        Ok(Cow::Borrowed("{"))
122    }
123
124    /// Parses as an escaped right curly brace (two curly brace tokens).
125    fn rcurly_escape(&mut self) -> Result<Cow<'s, str>, Error> {
126        use Token as T;
127        self.eat(T::RCurl)?;
128        self.eat(T::RCurl)?;
129        Ok(Cow::Borrowed("}"))
130    }
131
132    /// Parses a binding (curly braces optionally containing a type).
133    ///
134    /// Returns `Ok(None)` if there are no tokens left, `Ok(Some(None))` for a binding with no
135    /// type, `Ok(Some(Some(type)))` for a binding with a type, or an error if the binding is
136    /// malformed.
137    fn bind(&mut self) -> Result<Option<Option<syn::Type>>, Error> {
138        if self.lexemes.is_empty() {
139            return Ok(None);
140        }
141
142        self.eat(Token::LCurl)?;
143        if self.peek(Token::RCurl).is_ok() {
144            self.eat(Token::RCurl)?;
145            return Ok(Some(None));
146        }
147
148        let Lexeme(_, offset, type_) = self.eat(Token::Text)?;
149        self.eat(Token::RCurl)?;
150
151        let type_ = syn::parse_str(type_).map_err(|source| Error::TypeParse { offset, source })?;
152        Ok(Some(Some(type_)))
153    }
154
155    /// Consume the next token (returning it) as long as it matches `expect`.
156    fn eat(&mut self, expect: Token) -> Result<Lexeme<'s>, Error> {
157        match self.peek(expect) {
158            Ok(l) => {
159                self.lexemes = &self.lexemes[1..];
160                Ok(l)
161            }
162            Err(Some(l)) => Err(Error::Unexpected {
163                offset: l.1,
164                actual: l.0,
165                expect,
166            }),
167            Err(None) => Err(Error::UnexpectedEos { expect }),
168        }
169    }
170
171    /// Look at the next token and check that it matches `expect` without consuming it. Returns
172    /// the Ok of the lexeme if its token matches, otherwise it returns an error that either
173    /// contains the lexeme that did not match or None if there are no tokens left.
174    fn peek(&self, expect: Token) -> Result<Lexeme<'s>, Option<Lexeme<'s>>> {
175        match self.lexemes.first() {
176            Some(l) if l.0 == expect => Ok(*l),
177            Some(l) => Err(Some(*l)),
178            None => Err(None),
179        }
180    }
181}
182
#[cfg(test)]
mod tests {
    use crate::Lexer;

    use super::*;

    /// Test helper that lexes and then parses a format string.
    fn parse(s: &str) -> Result<Format<'_>, Error> {
        let lexemes: Vec<_> = Lexer::new(s).collect();
        Parser::new(&lexemes).format()
    }

    /// Test helper that parses a Rust type from a string, for comparing against binds.
    fn ty(s: &str) -> Option<syn::Type> {
        Some(syn::parse_str(s).unwrap())
    }

    #[test]
    fn test_no_binds() {
        // With no binders present, everything is gathered into the head.
        let expect = Format {
            head: "foo".into(),
            tail: vec![],
        };
        assert_eq!(parse("foo").unwrap(), expect);
    }

    #[test]
    fn test_single_bind() {
        // A binder's contents (without the surrounding braces) parse as a type, and the binder
        // splits the format string in two.
        let expect = Format {
            head: "foo = ".into(),
            tail: vec![(ty("Text"), " AND bar = 42".into())],
        };
        assert_eq!(parse("foo = {Text} AND bar = 42").unwrap(), expect);
    }

    #[test]
    fn test_multiple_binds() {
        // Multiple binders require the parser to detect the text in the gap between them.
        let expect = Format {
            head: "foo = ".into(),
            tail: vec![
                (ty("Text"), " AND (bar < ".into()),
                (ty("BigInt"), " OR bar > 5)".into()),
            ],
        };
        assert_eq!(
            parse("foo = {Text} AND (bar < {BigInt} OR bar > 5)").unwrap(),
            expect,
        );
    }

    #[test]
    fn test_ends_with_a_bind() {
        // A format string that ends with a binder yields an empty suffix for that binder.
        let expect = Format {
            head: "bar BETWEEN ".into(),
            tail: vec![(ty("BigInt"), " AND ".into()), (ty("BigInt"), "".into())],
        };
        assert_eq!(parse("bar BETWEEN {BigInt} AND {BigInt}").unwrap(), expect);
    }

    #[test]
    fn test_escaped_curlies() {
        // Escaped curlies collapse to a single brace in the output, without splitting the
        // surrounding strand of text.
        let expect = Format {
            head: "foo LIKE '{bar%'".into(),
            tail: vec![],
        };
        assert_eq!(parse("foo LIKE '{{bar%'").unwrap(), expect);
    }

    #[test]
    fn test_curly_nest() {
        // This input would be tricky if the lexer treated escaped curlies as a single token.
        let expect = Format {
            head: "{".into(),
            tail: vec![(ty("Bool"), "}".into())],
        };
        assert_eq!(parse("{{{Bool}}}").unwrap(), expect);
    }

    #[test]
    fn test_bind_unexpected_token() {
        // A binder that is not closed properly is an error.
        assert!(matches!(
            parse("{Bool{").unwrap_err(),
            Error::Unexpected {
                offset: 5,
                actual: Token::LCurl,
                expect: Token::RCurl
            },
        ));
    }

    #[test]
    fn test_bind_no_type() {
        // Binders may be empty (no type annotation).
        let expect = Format {
            head: "foo = ".into(),
            tail: vec![(None, "".into())],
        };
        assert_eq!(parse("foo = {}").unwrap(), expect);
    }

    #[test]
    fn test_bind_unfinished() {
        // A binder that is opened but never filled in is an error.
        assert!(matches!(
            parse("foo = {").unwrap_err(),
            Error::UnexpectedEos {
                expect: Token::Text,
            },
        ));
    }

    #[test]
    fn test_bind_no_rcurly() {
        // A binder whose closing curly is missing is an error.
        assert!(matches!(
            parse("foo = {Text").unwrap_err(),
            Error::UnexpectedEos {
                expect: Token::RCurl,
            },
        ));
    }

    #[test]
    fn test_bind_bad_type() {
        // Binder contents that do not parse as a Rust type are also an error for this parser.
        assert!(matches!(
            parse("foo = {not a type}").unwrap_err(),
            Error::TypeParse { offset: 7, .. },
        ));
    }
}