sui_sql_macro/
parser.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::borrow::Cow;
5
6use crate::lexer::{Lexeme, Token};
7
8/// Recursive descent parser for format strings. This struct is intended to be lightweight to
9/// support efficient backtracking. Backtracking is implemented by operating on a copy of the
10/// parser's state, and only updating the original when the copy has reached a known good position.
11#[derive(Copy, Clone)]
12pub(crate) struct Parser<'l, 's> {
13    lexemes: &'l [Lexeme<'s>],
14}
15
16/// Structured representation of a format string, starting with a text prefix (the head), followed
17/// by interleaved binders followed by text suffixes (the tail).
18#[derive(Debug, PartialEq, Eq)]
19pub(crate) struct Format<'s> {
20    pub head: Cow<'s, str>,
21    pub tail: Vec<(Option<syn::Type>, Cow<'s, str>)>,
22}
23
24#[derive(thiserror::Error, Debug)]
25pub(crate) enum Error {
26    #[error("Unexpected {actual} at offset {offset}, expected {expect}")]
27    Unexpected {
28        offset: usize,
29        actual: Token,
30        expect: Token,
31    },
32
33    #[error("Unexpected end of format string, expected: {expect:?}")]
34    UnexpectedEos { expect: Token },
35
36    #[error("Error parsing type for binder at offset {offset}: {source}")]
37    TypeParse { offset: usize, source: syn::Error },
38}
39
40/// Recursive descent parser recognizing the following grammar:
41///
42///   format        ::= text? (bind text?)*
43///   text          ::= part +
44///   part          ::= lcurly_escape | rcurly_escape | TEXT
45///   bind          ::= '{' TEXT? '}'
46///   lcurly_escape ::= '{' '{'
47///   rcurly_escape ::= '}' '}'
48///
49/// The parser reads tokens from the front, consuming them when it is able to successfully parse a
50/// non-terminal. When parsing fails, the parser may be left in an indeterminate state.
51/// Backtracking requires explicitly copying the parser state.
52impl<'l, 's> Parser<'l, 's> {
53    /// Constructs a new parser instance that will consume the given `lexemes`.
54    pub(crate) fn new(lexemes: &'l [Lexeme<'s>]) -> Self {
55        Self { lexemes }
56    }
57
58    /// Entrypoint to the parser. Consumes the entire remaining output (and therefore also the
59    /// parser).
60    pub(crate) fn format(mut self) -> Result<Format<'s>, Error> {
61        let head = self.text_opt();
62
63        let mut tail = vec![];
64        while let Some(bind) = self.bind()? {
65            let suffix = self.text_opt();
66            tail.push((bind, suffix));
67        }
68
69        Ok(Format { head, tail })
70    }
71
72    /// Parse a strand of text. The parser is left in its initial state and an empty string is
73    /// returned if there was no text to parse.
74    fn text_opt(&mut self) -> Cow<'s, str> {
75        let mut copy = *self;
76        copy.text().map_or(Cow::Borrowed(""), |t| {
77            *self = copy;
78            t
79        })
80    }
81
82    /// Parse a strand of text by gathering together as many `part`s as possible. Errors if there
83    /// is not at least one `part` to consume.
84    fn text(&mut self) -> Result<Cow<'s, str>, Error> {
85        let mut text = self.part()?;
86
87        let mut copy = *self;
88        while let Ok(part) = copy.part() {
89            text += part;
90            *self = copy;
91        }
92
93        Ok(text)
94    }
95
96    /// Parses any of the three possible `part`s of a text strand: a string of text, or an escaped
97    /// curly brace. Errors if the token stream starts with a binder.
98    fn part(&mut self) -> Result<Cow<'s, str>, Error> {
99        let mut copy = *self;
100        if let Ok(s) = copy.lcurly_escape() {
101            *self = copy;
102            return Ok(s);
103        }
104
105        let mut copy = *self;
106        if let Ok(s) = copy.rcurly_escape() {
107            *self = copy;
108            return Ok(s);
109        }
110
111        let Lexeme(_, _, text) = self.eat(Token::Text)?;
112        Ok(Cow::Borrowed(text))
113    }
114
115    /// Parses as an escaped left curly brace (two curly brace tokens).
116    fn lcurly_escape(&mut self) -> Result<Cow<'s, str>, Error> {
117        use Token as T;
118        self.eat(T::LCurl)?;
119        self.eat(T::LCurl)?;
120        Ok(Cow::Borrowed("{"))
121    }
122
123    /// Parses as an escaped right curly brace (two curly brace tokens).
124    fn rcurly_escape(&mut self) -> Result<Cow<'s, str>, Error> {
125        use Token as T;
126        self.eat(T::RCurl)?;
127        self.eat(T::RCurl)?;
128        Ok(Cow::Borrowed("}"))
129    }
130
131    /// Parses a binding (curly braces optionally containing a type).
132    ///
133    /// Returns `Ok(None)` if there are no tokens left, `Ok(Some(None))` for a binding with no
134    /// type, `Ok(Some(Some(type)))` for a binding with a type, or an error if the binding is
135    /// malformed.
136    fn bind(&mut self) -> Result<Option<Option<syn::Type>>, Error> {
137        if self.lexemes.is_empty() {
138            return Ok(None);
139        }
140
141        self.eat(Token::LCurl)?;
142        if self.peek(Token::RCurl).is_ok() {
143            self.eat(Token::RCurl)?;
144            return Ok(Some(None));
145        }
146
147        let Lexeme(_, offset, type_) = self.eat(Token::Text)?;
148        self.eat(Token::RCurl)?;
149
150        let type_ = syn::parse_str(type_).map_err(|source| Error::TypeParse { offset, source })?;
151        Ok(Some(Some(type_)))
152    }
153
154    /// Consume the next token (returning it) as long as it matches `expect`.
155    fn eat(&mut self, expect: Token) -> Result<Lexeme<'s>, Error> {
156        match self.peek(expect) {
157            Ok(l) => {
158                self.lexemes = &self.lexemes[1..];
159                Ok(l)
160            }
161            Err(Some(l)) => Err(Error::Unexpected {
162                offset: l.1,
163                actual: l.0,
164                expect,
165            }),
166            Err(None) => Err(Error::UnexpectedEos { expect }),
167        }
168    }
169
170    /// Look at the next token and check that it matches `expect` without consuming it. Returns
171    /// the Ok of the lexeme if its token matches, otherwise it returns an error that either
172    /// contains the lexeme that did not match or None if there are no tokens left.
173    fn peek(&self, expect: Token) -> Result<Lexeme<'s>, Option<Lexeme<'s>>> {
174        match self.lexemes.first() {
175            Some(l) if l.0 == expect => Ok(*l),
176            Some(l) => Err(Some(*l)),
177            None => Err(None),
178        }
179    }
180}
181
182#[cfg(test)]
183mod tests {
184    use crate::Lexer;
185
186    use super::*;
187
188    /// Test helper for lexing and parsing a format string.
189    fn parse(s: &str) -> Result<Format<'_>, Error> {
190        let lexemes: Vec<_> = Lexer::new(s).collect();
191        Parser::new(&lexemes).format()
192    }
193
194    /// Parse a Rust type from a string, to compare against binds in tests.
195    fn type_(s: &str) -> Option<syn::Type> {
196        Some(syn::parse_str(s).unwrap())
197    }
198
199    #[test]
200    fn test_no_binds() {
201        // A simple format string with no binders will gather everything into the head.
202        assert_eq!(
203            parse("foo").unwrap(),
204            Format {
205                head: "foo".into(),
206                tail: vec![]
207            }
208        );
209    }
210
211    #[test]
212    fn test_single_bind() {
213        // A binder is parsed without its surrounding binders as a type, and splits the format
214        // string in two.
215        assert_eq!(
216            parse("foo = {Text} AND bar = 42").unwrap(),
217            Format {
218                head: "foo = ".into(),
219                tail: vec![(type_("Text"), " AND bar = 42".into())]
220            },
221        );
222    }
223
224    #[test]
225    fn test_multiple_binds() {
226        // When there are multiple binders the parser needs to detect the gap between binders.
227        assert_eq!(
228            parse("foo = {Text} AND (bar < {BigInt} OR bar > 5)").unwrap(),
229            Format {
230                head: "foo = ".into(),
231                tail: vec![
232                    (type_("Text"), " AND (bar < ".into()),
233                    (type_("BigInt"), " OR bar > 5)".into()),
234                ],
235            },
236        );
237    }
238
239    #[test]
240    fn test_ends_with_a_bind() {
241        // If the format string ends with a binder, the parser still needs to find an empty suffix
242        // binder.
243        assert_eq!(
244            parse("bar BETWEEN {BigInt} AND {BigInt}").unwrap(),
245            Format {
246                head: "bar BETWEEN ".into(),
247                tail: vec![
248                    (type_("BigInt"), " AND ".into()),
249                    (type_("BigInt"), "".into()),
250                ],
251            },
252        );
253    }
254
255    #[test]
256    fn test_escaped_curlies() {
257        // Escaped curlies are de-duplicated in the parsed output, but the parser does not break up
258        // strands of format string on them.
259        assert_eq!(
260            parse("foo LIKE '{{bar%'").unwrap(),
261            Format {
262                head: "foo LIKE '{bar%'".into(),
263                tail: vec![],
264            },
265        );
266    }
267
268    #[test]
269    fn test_curly_nest() {
270        // This input can be tricky to parse if the lexer treats escaped curlies as a single token.
271        assert_eq!(
272            parse("{{{Bool}}}").unwrap(),
273            Format {
274                head: "{".into(),
275                tail: vec![(type_("Bool"), "}".into())],
276            },
277        );
278    }
279
280    #[test]
281    fn test_bind_unexpected_token() {
282        // Error if the binder is not properly closed.
283        assert!(matches!(
284            parse("{Bool{").unwrap_err(),
285            Error::Unexpected {
286                offset: 5,
287                actual: Token::LCurl,
288                expect: Token::RCurl
289            },
290        ));
291    }
292
293    #[test]
294    fn test_bind_no_type() {
295        // Empty binders are supported.
296        assert_eq!(
297            parse("foo = {}").unwrap(),
298            Format {
299                head: "foo = ".into(),
300                tail: vec![(None, "".into())],
301            }
302        );
303    }
304
305    #[test]
306    fn test_bind_unfinished() {
307        // Error if the binder does not contain a type.
308        assert!(matches!(
309            parse("foo = {").unwrap_err(),
310            Error::UnexpectedEos {
311                expect: Token::Text,
312            },
313        ));
314    }
315
316    #[test]
317    fn test_bind_no_rcurly() {
318        // Error if the binder ends before it is closed.
319        assert!(matches!(
320            parse("foo = {Text").unwrap_err(),
321            Error::UnexpectedEos {
322                expect: Token::RCurl,
323            },
324        ));
325    }
326
327    #[test]
328    fn test_bind_bad_type() {
329        // Failure to parse the binder as a Rust type is also an error for this parser.
330        assert!(matches!(
331            parse("foo = {not a type}").unwrap_err(),
332            Error::TypeParse { offset: 7, .. },
333        ));
334    }
335}