// sui_display/v2/lexer.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3#![allow(dead_code)]
4
5use std::fmt;
6
/// Lexer for Display V2 format strings. Format strings are a mix of text and expressions.
/// Expressions are enclosed in braces and may contain multiple alternates, separated by pipes, and
/// each containing nested field, vector, or dynamic field accesses.
///
/// The lexer is an [`Iterator`] over [`Lexeme`]s; which tokenizer runs for the next lexeme is
/// decided by `level` (see below).
#[derive(Debug)]
pub(crate) struct Lexer<'s> {
    /// Remaining input to be tokenized. Shrinks from the front as lexemes are produced.
    src: &'s str,

    /// The number of bytes (not characters) tokenized so far. Recorded into each lexeme so that
    /// errors can point at an offset in the original input.
    off: usize,

    /// Nesting of curly braces. At level 0, the lexer is in text mode. At all other levels, it is
    /// in expression mode.
    level: usize,
}
22
/// A lexeme is a slice of the source string marked with a token. The `bool` field indicates
/// whether the lexeme was preceded by whitespace or not.
///
/// Fields, in order: preceding-whitespace flag, token kind, byte offset of the lexeme within the
/// original source, and the matched slice itself.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub(crate) struct Lexeme<'s>(pub bool, pub Token, pub usize, pub &'s str);
27
/// Like `Lexeme` but owns the slice of source string. Useful for capturing context in an error
/// message.
///
/// Fields mirror [`Lexeme`]: preceding-whitespace flag, token kind, byte offset, and the copied
/// source text.
#[derive(Debug, Clone)]
pub struct OwnedLexeme(
    pub(crate) bool,
    pub(crate) Token,
    pub(crate) usize,
    pub(crate) String,
);
37
// NOTE: `PartialOrd`/`Ord` are derived, so the declaration order of variants is significant to
// their comparison order.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum Token {
    /// '->'
    Arrow,
    /// '=>' (a "double arrow")
    AArrow,
    /// '@'
    At,
    /// ':'
    Colon,
    /// '::' (a "double colon")
    CColon,
    /// ','
    Comma,
    /// '.'
    Dot,
    /// An identifier
    Ident,
    /// '<'
    LAngle,
    /// '{'
    LBrace,
    /// '['
    LBracket,
    /// '{{' -- the escape sequence for a literal '{' in text mode.
    LLBrace,
    /// '('
    LParen,
    /// A decimal number, optionally separated by underscores.
    NumDec,
    /// A hexadecimal number, prefixed with '0x' (not included in the span), optionally separated
    /// by underscores.
    NumHex,
    /// '|'
    Pipe,
    /// '#'
    Pound,
    /// '>'
    RAngle,
    /// '}'
    RBrace,
    /// ']'
    RBracket,
    /// ')'
    RParen,
    /// '}}' -- the escape sequence for a literal '}' in text mode.
    RRBrace,
    /// Strings are surrounded by single quotes. Quotes and backslashes inside strings are escaped
    /// with backslashes.
    String,
    /// A strand of text.
    Text,

    /// An unexpected byte in the input string.
    Unexpected,
}
94
impl<'s> Lexer<'s> {
    /// Create a new lexer over `src`, starting at offset 0 in text mode.
    pub(crate) fn new(src: &'s str) -> Self {
        Self {
            src,
            off: 0,
            level: 0,
        }
    }

    /// Assuming the lexer is in text mode, return the next text token.
    fn next_text_token(&mut self) -> Option<Lexeme<'s>> {
        let bytes = self.src.as_bytes();

        use Token as T;
        Some(match bytes.first()? {
            // '{{' escapes a literal '{'. The first brace is skipped, and the emitted token
            // spans only the second, so the lexeme's slice is the single brace it stands for.
            b'{' if bytes.get(1) == Some(&b'{') => {
                self.advance(1);
                self.take(false, T::LLBrace, 1)
            }

            // An unescaped '{' switches the lexer into expression mode.
            b'{' => {
                self.level += 1;
                self.take(false, T::LBrace, 1)
            }

            // '}}' escapes a literal '}', handled like '{{' above.
            b'}' if bytes.get(1) == Some(&b'}') => {
                self.advance(1);
                self.take(false, T::RRBrace, 1)
            }

            // This is not a valid token within text, but is recognised so that the parser can
            // produce a better error message. `level` is not decremented because we should
            // already be in text mode, meaning the level is already 0, and a decrement would
            // underflow it.
            b'}' => self.take(false, T::RBrace, 1),

            // Everything else is literal text, running up to the next brace (or end of input).
            _ => self.take_until(false, T::Text, |b| b"{}".contains(&b)),
        })
    }

    /// Assuming the lexer is in expression mode, return the next expression token.
    fn next_expr_token(&mut self) -> Option<Lexeme<'s>> {
        // Unlike text mode, expression mode skips whitespace, remembering whether any was seen
        // so the parser can distinguish e.g. `foo[` from `foo [`.
        let ws = self.take_whitespace();
        let bytes = self.src.as_bytes();

        use Token as T;
        Some(match bytes.first()? {
            b'-' if bytes.get(1) == Some(&b'>') => self.take(ws, T::Arrow, 2),

            b'=' if bytes.get(1) == Some(&b'>') => self.take(ws, T::AArrow, 2),

            b'@' => self.take(ws, T::At, 1),

            b':' if bytes.get(1) == Some(&b':') => self.take(ws, T::CColon, 2),

            b':' => self.take(ws, T::Colon, 1),

            b',' => self.take(ws, T::Comma, 1),

            b'.' => self.take(ws, T::Dot, 1),

            // A hexadecimal literal requires at least one digit after the '0x' prefix;
            // otherwise the '0' lexes as a decimal number (see `test_incomplete_hexadecimal`).
            // The prefix is skipped so the lexeme contains only the digits.
            b'0' if bytes.get(1) == Some(&b'x')
                && bytes.get(2).is_some_and(|b| is_valid_hex_byte(*b)) =>
            {
                self.advance(2);
                self.take_until(ws, T::NumHex, |c| !is_valid_hex_byte(c))
            }

            b'0'..=b'9' => self.take_until(ws, T::NumDec, |c| !is_valid_decimal_byte(c)),

            b'a'..=b'z' | b'A'..=b'Z' | b'_' => {
                self.take_until(ws, T::Ident, |c| !is_valid_identifier_byte(c))
            }

            b'<' => self.take(ws, T::LAngle, 1),

            // Braces nest inside expressions (e.g. struct literals), so this deepens the level
            // rather than emitting text.
            b'{' => {
                self.level += 1;
                self.take(ws, T::LBrace, 1)
            }

            b'[' => self.take(ws, T::LBracket, 1),

            b'(' => self.take(ws, T::LParen, 1),

            b'|' => self.take(ws, T::Pipe, 1),

            b'#' => self.take(ws, T::Pound, 1),

            b'>' => self.take(ws, T::RAngle, 1),

            // Closing a brace pops one nesting level; at level 0 the iterator switches back to
            // text mode.
            b'}' => {
                self.level -= 1;
                self.take(ws, T::RBrace, 1)
            }

            b']' => self.take(ws, T::RBracket, 1),

            b')' => self.take(ws, T::RParen, 1),

            b'\'' => {
                // Set the escaped indicator to true initially so we don't interpret the starting
                // quote as an ending quote.
                let mut escaped = true;
                for (i, b) in self.src.bytes().enumerate() {
                    if escaped {
                        escaped = false;
                    } else if b == b'\\' {
                        escaped = true;
                    } else if b == b'\'' {
                        // `i` is the index of the closing quote. The first `advance` skips the
                        // opening quote, the `take` spans the `i - 1` content bytes between the
                        // quotes, and the second `advance` skips the closing quote.
                        self.advance(1);
                        let content = self.take(ws, T::String, i - 1);
                        self.advance(1);
                        return Some(content);
                    }
                }

                // Reached the end of the byte stream and didn't find a closing quote -- treat the
                // partial string as an unexpected token.
                self.take(ws, T::Unexpected, self.src.len())
            }

            // If the next byte cannot be recognized, extract the next (potentially variable
            // length) character, and indicate that it is an unexpected token.
            _ => {
                let next_boundary = (1..=self.src.len())
                    .find(|&i| self.src.is_char_boundary(i))
                    .unwrap_or(self.src.len());
                self.take(ws, T::Unexpected, next_boundary)
            }
        })
    }

    /// Eat ASCII whitespace from the beginning of `self.src`. Returns `true` if and only if any
    /// whitespace was consumed.
    fn take_whitespace(&mut self) -> bool {
        // The token passed here is irrelevant -- only the consumed slice is inspected, and the
        // resulting lexeme is discarded.
        let Lexeme(_, _, _, slice) =
            self.take_until(false, Token::Unexpected, |b| !b.is_ascii_whitespace());
        !slice.is_empty()
    }

    /// Take a prefix of bytes from `self.src` until a byte satisfying pattern `p` is found, and
    /// return it as a lexeme of type `t`. If no such byte is found, take the entire remainder of
    /// the source string.
    fn take_until(&mut self, ws: bool, t: Token, p: impl FnMut(u8) -> bool) -> Lexeme<'s> {
        let n = self.src.bytes().position(p).unwrap_or(self.src.len());
        self.take(ws, t, n)
    }

    /// Take `n` bytes from the beginning of `self.src` and return them as a lexeme of type `t`.
    ///
    /// ## Panics
    ///
    /// This function assumes that there are at least `n` bytes left in `self.src`, and will panic
    /// if that is not the case.
    fn take(&mut self, ws: bool, t: Token, n: usize) -> Lexeme<'s> {
        let start = self.off;
        let slice = &self.src[..n];
        self.advance(n);

        Lexeme(ws, t, start, slice)
    }

    /// Move the cursor forward by `n` bytes.
    ///
    /// ## Panics
    ///
    /// This function assumes that `n` is less than or equal to the length of `self.src`, and will
    /// panic if that is not the case.
    fn advance(&mut self, n: usize) {
        self.src = &self.src[n..];
        self.off += n;
    }
}
269
270impl Lexeme<'_> {
271    /// Return the lexeme as an owned lexeme, with the slice of source string copied.
272    pub(crate) fn detach(&self) -> OwnedLexeme {
273        OwnedLexeme(self.0, self.1, self.2, self.3.to_owned())
274    }
275}
276
277impl<'s> Iterator for Lexer<'s> {
278    type Item = Lexeme<'s>;
279
280    fn next(&mut self) -> Option<Self::Item> {
281        if self.level == 0 {
282            self.next_text_token()
283        } else {
284            self.next_expr_token()
285        }
286    }
287}
288
289impl fmt::Display for OwnedLexeme {
290    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
291        use OwnedLexeme as L;
292        use Token as T;
293
294        if self.0 {
295            write!(f, "whitespace followed by ")?;
296        }
297
298        match self {
299            L(_, T::Arrow, _, _) => write!(f, "'->'"),
300            L(_, T::AArrow, _, _) => write!(f, "'=>'"),
301            L(_, T::At, _, _) => write!(f, "'@'"),
302            L(_, T::Colon, _, _) => write!(f, "':'"),
303            L(_, T::CColon, _, _) => write!(f, "'::'"),
304            L(_, T::Comma, _, _) => write!(f, "','"),
305            L(_, T::Dot, _, _) => write!(f, "'.'"),
306            L(_, T::Ident, _, s) => write!(f, "identifier {s:?}"),
307            L(_, T::LAngle, _, _) => write!(f, "'<'"),
308            L(_, T::LBrace, _, _) => write!(f, "'{{'"),
309            L(_, T::LBracket, _, _) => write!(f, "'['"),
310            L(_, T::LLBrace, _, _) => write!(f, "'{{{{'"),
311            L(_, T::LParen, _, _) => write!(f, "'('"),
312            L(_, T::NumDec, _, s) => write!(f, "decimal number {s:?}"),
313            L(_, T::NumHex, _, s) => write!(f, "hexadecimal number {s:?}"),
314            L(_, T::Pipe, _, _) => write!(f, "'|'"),
315            L(_, T::Pound, _, _) => write!(f, "'#'"),
316            L(_, T::RAngle, _, _) => write!(f, "'>'"),
317            L(_, T::RBrace, _, _) => write!(f, "'}}'"),
318            L(_, T::RBracket, _, _) => write!(f, "']'"),
319            L(_, T::RParen, _, _) => write!(f, "')'"),
320            L(_, T::RRBrace, _, _) => write!(f, "'}}}}'"),
321            L(_, T::String, _, s) => write!(f, "string {s:?}"),
322            L(_, T::Text, _, s) => write!(f, "text {s:?}"),
323            L(_, T::Unexpected, _, s) => {
324                write!(f, "\"")?;
325                for b in s.bytes() {
326                    match b {
327                        b'"' => write!(f, "\\\"")?,
328                        b'\\' => write!(f, "\\\\")?,
329                        b'\n' => write!(f, "\\n")?,
330                        b'\t' => write!(f, "\\t")?,
331                        b'\r' => write!(f, "\\r")?,
332                        b if b.is_ascii_graphic() || b == b' ' => write!(f, "{}", b as char)?,
333                        b => write!(f, "\\x{:02X}", b)?,
334                    }
335                }
336                write!(f, "\"")
337            }
338        }?;
339
340        write!(f, " at offset {}", self.2)
341    }
342}
343
344impl fmt::Display for Token {
345    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
346        use Token as T;
347        match self {
348            T::Arrow => write!(f, "'->'"),
349            T::AArrow => write!(f, "'=>'"),
350            T::At => write!(f, "'@'"),
351            T::Colon => write!(f, "':'"),
352            T::CColon => write!(f, "'::'"),
353            T::Comma => write!(f, "','"),
354            T::Dot => write!(f, "'.'"),
355            T::Ident => write!(f, "an identifier"),
356            T::LAngle => write!(f, "'<'"),
357            T::LBrace => write!(f, "'{{'"),
358            T::LBracket => write!(f, "'['"),
359            T::LLBrace => write!(f, "'{{{{'"),
360            T::LParen => write!(f, "'('"),
361            T::NumDec => write!(f, "a decimal number"),
362            T::NumHex => write!(f, "a hexadecimal number"),
363            T::Pipe => write!(f, "'|'"),
364            T::Pound => write!(f, "'#'"),
365            T::RAngle => write!(f, "'>'"),
366            T::RBrace => write!(f, "'}}'"),
367            T::RBracket => write!(f, "']'"),
368            T::RParen => write!(f, "')'"),
369            T::RRBrace => write!(f, "'}}}}'"),
370            T::String => write!(f, "a string"),
371            T::Text => write!(f, "text"),
372            T::Unexpected => write!(f, "unexpected input"),
373        }
374    }
375}
376
/// A byte may appear in an identifier if it is an ASCII letter, ASCII digit, or underscore.
fn is_valid_identifier_byte(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
380
/// A byte may appear in a hexadecimal literal if it is a hex digit (either case) or an
/// underscore separator.
fn is_valid_hex_byte(b: u8) -> bool {
    b == b'_' || b.is_ascii_hexdigit()
}
384
/// A byte may appear in a decimal literal if it is an ASCII digit or an underscore separator.
fn is_valid_decimal_byte(b: u8) -> bool {
    b == b'_' || b.is_ascii_digit()
}
388
#[cfg(test)]
mod tests {
    use super::*;
    use Lexeme as L;
    use insta::assert_snapshot;

    /// Render every lexeme produced for `src` on its own line, escaping quotes, backslashes, and
    /// non-printable bytes so the snapshot output is stable even for inputs that are not valid
    /// UTF-8.
    fn lexemes(src: &str) -> String {
        Lexer::new(src)
            .map(|L(ws, t, o, s)| {
                // Handle potentially invalid UTF-8 by working at byte level
                let safe_s: String = s
                    .bytes()
                    .map(|b| match b {
                        b'"' => "\\\"".to_string(),
                        b'\\' => "\\\\".to_string(),
                        b'\n' => "\\n".to_string(),
                        b'\t' => "\\t".to_string(),
                        b'\r' => "\\r".to_string(),
                        b if b.is_ascii_graphic() || b == b' ' => (b as char).to_string(),
                        b => format!("\\x{:02X}", b),
                    })
                    .collect();
                format!("L({ws:?}, {t:?}, {o:?}, \"{}\")", safe_s)
            })
            .collect::<Vec<_>>()
            .join("\n")
    }

    /// Simple test for a raw literal string.
    #[test]
    fn test_all_text() {
        assert_snapshot!(lexemes("foo bar"), @r###"L(false, Text, 0, "foo bar")"###);
    }

    /// Escape sequences are all text, but they will be split into multiple tokens.
    #[test]
    fn test_escapes() {
        assert_snapshot!(lexemes(r#"foo {{bar}}"#), @r###"
        L(false, Text, 0, "foo ")
        L(false, LLBrace, 5, "{")
        L(false, Text, 6, "bar")
        L(false, RRBrace, 10, "}")
        "###);
    }

    /// Text inside braces is tokenized as if it's an expression.
    #[test]
    fn test_expressions() {
        assert_snapshot!(lexemes(r#"foo {bar}"#), @r###"
        L(false, Text, 0, "foo ")
        L(false, LBrace, 4, "{")
        L(false, Ident, 5, "bar")
        L(false, RBrace, 8, "}")
        "###);
    }

    /// Expressions are tokenized to ignore whitespace.
    #[test]
    fn test_expression_whitespace() {
        assert_snapshot!(lexemes(r#"foo {  bar   }"#), @r###"
        L(false, Text, 0, "foo ")
        L(false, LBrace, 4, "{")
        L(true, Ident, 7, "bar")
        L(true, RBrace, 13, "}")
        "###);
    }

    /// Field names are separated by dots in an expression.
    #[test]
    fn test_expression_dots() {
        assert_snapshot!(lexemes(r#"foo {bar. baz  . qux}"#), @r###"
        L(false, Text, 0, "foo ")
        L(false, LBrace, 4, "{")
        L(false, Ident, 5, "bar")
        L(false, Dot, 8, ".")
        L(true, Ident, 10, "baz")
        L(true, Dot, 15, ".")
        L(true, Ident, 17, "qux")
        L(false, RBrace, 20, "}")
        "###);
    }

    /// Multiple expressions test switching and back and forth between lexer modes.
    #[test]
    fn test_multiple_expressions() {
        assert_snapshot!(lexemes(r#"foo {bar.baz} qux {quy.quz}"#), @r###"
        L(false, Text, 0, "foo ")
        L(false, LBrace, 4, "{")
        L(false, Ident, 5, "bar")
        L(false, Dot, 8, ".")
        L(false, Ident, 9, "baz")
        L(false, RBrace, 12, "}")
        L(false, Text, 13, " qux ")
        L(false, LBrace, 18, "{")
        L(false, Ident, 19, "quy")
        L(false, Dot, 22, ".")
        L(false, Ident, 23, "quz")
        L(false, RBrace, 26, "}")
        "###);
    }

    /// Expressions can include nested curly braces. Meeting the first well-bracketed closing curly
    /// brace should not cause the lexer to exit expression mode.
    #[test]
    fn test_nested_curlies() {
        assert_snapshot!(lexemes(r#"foo {bar {baz} qux}"#), @r###"
        L(false, Text, 0, "foo ")
        L(false, LBrace, 4, "{")
        L(false, Ident, 5, "bar")
        L(true, LBrace, 9, "{")
        L(false, Ident, 10, "baz")
        L(false, RBrace, 13, "}")
        L(true, Ident, 15, "qux")
        L(false, RBrace, 18, "}")
        "###);
    }

    /// The lexer will still tokenize curlies even if they are not balanced.
    #[test]
    fn test_unbalanced_curlies() {
        assert_snapshot!(lexemes(r#"foo}{bar{}}"#), @r###"
        L(false, Text, 0, "foo")
        L(false, RBrace, 3, "}")
        L(false, LBrace, 4, "{")
        L(false, Ident, 5, "bar")
        L(false, LBrace, 8, "{")
        L(false, RBrace, 9, "}")
        L(false, RBrace, 10, "}")
        "###);
    }

    /// Unexpected characters are tokenized so that the parser can produce an error.
    #[test]
    fn test_unexpected_characters() {
        assert_snapshot!(lexemes(r#"anything goes {? % ! 🔥}"#), @r###"
        L(false, Text, 0, "anything goes ")
        L(false, LBrace, 14, "{")
        L(false, Unexpected, 15, "?")
        L(true, Unexpected, 17, "%")
        L(true, Unexpected, 19, "!")
        L(true, Unexpected, 21, "\xF0\x9F\x94\xA5")
        L(false, RBrace, 25, "}")
        "###);
    }

    /// Escaped curlies shouldn't be tokenized greedily. '{{{' in text mode should be tokenized as
    /// '{{' and '{', while '}}}' in expr mode should be tokenized as '}' and '}}'. This test
    /// exercises these and similar cases.
    #[test]
    fn test_triple_curlies() {
        assert_snapshot!(lexemes(r#"foo {{{bar} {baz}}} }}} { {{ } qux"#), @r###"
        L(false, Text, 0, "foo ")
        L(false, LLBrace, 5, "{")
        L(false, LBrace, 6, "{")
        L(false, Ident, 7, "bar")
        L(false, RBrace, 10, "}")
        L(false, Text, 11, " ")
        L(false, LBrace, 12, "{")
        L(false, Ident, 13, "baz")
        L(false, RBrace, 16, "}")
        L(false, RRBrace, 18, "}")
        L(false, Text, 19, " ")
        L(false, RRBrace, 21, "}")
        L(false, RBrace, 22, "}")
        L(false, Text, 23, " ")
        L(false, LBrace, 24, "{")
        L(true, LBrace, 26, "{")
        L(false, LBrace, 27, "{")
        L(true, RBrace, 29, "}")
        L(true, Ident, 31, "qux")
        "###);
    }

    /// Pipes separate top-level expressions, but are only parsed inside expressions, not inside
    /// text.
    #[test]
    fn test_alternates() {
        assert_snapshot!(lexemes(r#"foo | {bar | baz.qux} | quy"#), @r###"
        L(false, Text, 0, "foo | ")
        L(false, LBrace, 6, "{")
        L(false, Ident, 7, "bar")
        L(true, Pipe, 11, "|")
        L(true, Ident, 13, "baz")
        L(false, Dot, 16, ".")
        L(false, Ident, 17, "qux")
        L(false, RBrace, 20, "}")
        L(false, Text, 21, " | quy")
        "###);
    }

    /// Display supports three kinds of index -- `foo[i]`, `bar->[j]`, and `baz=>[k]`, representing
    /// vector/VecMap, dynamic field, and dynamic object field access respectively.
    #[test]
    fn test_indices() {
        assert_snapshot!(lexemes(r#"foo {bar[baz].qux=>[quy]->[quz]}"#), @r###"
        L(false, Text, 0, "foo ")
        L(false, LBrace, 4, "{")
        L(false, Ident, 5, "bar")
        L(false, LBracket, 8, "[")
        L(false, Ident, 9, "baz")
        L(false, RBracket, 12, "]")
        L(false, Dot, 13, ".")
        L(false, Ident, 14, "qux")
        L(false, AArrow, 17, "=>")
        L(false, LBracket, 19, "[")
        L(false, Ident, 20, "quy")
        L(false, RBracket, 23, "]")
        L(false, Arrow, 24, "->")
        L(false, LBracket, 26, "[")
        L(false, Ident, 27, "quz")
        L(false, RBracket, 30, "]")
        L(false, RBrace, 31, "}")
        "###);
    }

    /// Numbers can be represented in decimal or hexadecimal (prefixed with 0x).
    #[test]
    fn test_numeric_literals() {
        assert_snapshot!(lexemes(r#"{123 0x123 def 0xdef}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, NumDec, 1, "123")
        L(true, NumHex, 7, "123")
        L(true, Ident, 11, "def")
        L(true, NumHex, 17, "def")
        L(false, RBrace, 20, "}")
        "###);
    }

    /// Numbers can optionally be grouped using underscores. Underscores cannot be trailing, but
    /// otherwise can appear in every position. Leading underscores will cause the numeric literal
    /// to be interpreted as an identifier, not a number.
    #[test]
    fn test_numeric_literal_underscores() {
        assert_snapshot!(lexemes(r#"{123_456 0x12_ab_de _123}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, NumDec, 1, "123_456")
        L(true, NumHex, 11, "12_ab_de")
        L(true, Ident, 20, "_123")
        L(false, RBrace, 24, "}")
        "###);
    }

    /// Address literals are numbers prefixed with '@' -- typically, they are hexadecimal numbers
    /// but both kinds are supported.
    #[test]
    fn test_address_literals() {
        assert_snapshot!(lexemes(r#"{@123 @0x123}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, At, 1, "@")
        L(false, NumDec, 2, "123")
        L(true, At, 6, "@")
        L(false, NumHex, 9, "123")
        L(false, RBrace, 12, "}")
        "###);
    }

    /// If the hexadecimal token is incomplete, it is not recognised as a number.
    #[test]
    fn test_incomplete_hexadecimal() {
        assert_snapshot!(lexemes(r#"{0x}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, NumDec, 1, "0")
        L(false, Ident, 2, "x")
        L(false, RBrace, 3, "}")
        "###);
    }

    /// Vector literals are always prefixed by the 'vector' keyword. Empty vectors must specify a
    /// type parameter (which is optional for non-empty vectors).
    #[test]
    fn test_vector_literals() {
        assert_snapshot!(lexemes(r#"{vector[1, 2, 3] vector<u32> vector[4u64]}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, Ident, 1, "vector")
        L(false, LBracket, 7, "[")
        L(false, NumDec, 8, "1")
        L(false, Comma, 9, ",")
        L(true, NumDec, 11, "2")
        L(false, Comma, 12, ",")
        L(true, NumDec, 14, "3")
        L(false, RBracket, 15, "]")
        L(true, Ident, 17, "vector")
        L(false, LAngle, 23, "<")
        L(false, Ident, 24, "u32")
        L(false, RAngle, 27, ">")
        L(true, Ident, 29, "vector")
        L(false, LBracket, 35, "[")
        L(false, NumDec, 36, "4")
        L(false, Ident, 37, "u64")
        L(false, RBracket, 40, "]")
        L(false, RBrace, 41, "}")
        "###);
    }

    /// Struct types are fully-qualified, with a numerical (hexadecimal) address.
    #[test]
    fn test_types() {
        assert_snapshot!(lexemes(r#"{0x2::table::Table<address, 0x2::coin::Coin<0x2::sui::SUI>>}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, NumHex, 3, "2")
        L(false, CColon, 4, "::")
        L(false, Ident, 6, "table")
        L(false, CColon, 11, "::")
        L(false, Ident, 13, "Table")
        L(false, LAngle, 18, "<")
        L(false, Ident, 19, "address")
        L(false, Comma, 26, ",")
        L(true, NumHex, 30, "2")
        L(false, CColon, 31, "::")
        L(false, Ident, 33, "coin")
        L(false, CColon, 37, "::")
        L(false, Ident, 39, "Coin")
        L(false, LAngle, 43, "<")
        L(false, NumHex, 46, "2")
        L(false, CColon, 47, "::")
        L(false, Ident, 49, "sui")
        L(false, CColon, 52, "::")
        L(false, Ident, 54, "SUI")
        L(false, RAngle, 57, ">")
        L(false, RAngle, 58, ">")
        L(false, RBrace, 59, "}")
        "###);
    }

    /// A positional struct literal is a struct type followed by its (positional) fields, separated
    /// by commas, surrounded by parentheses.
    #[test]
    fn test_positional_struct_literals() {
        assert_snapshot!(lexemes(r#"{0x2::balance::Balance<0x2::sui::SUI>(42u64)}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, NumHex, 3, "2")
        L(false, CColon, 4, "::")
        L(false, Ident, 6, "balance")
        L(false, CColon, 13, "::")
        L(false, Ident, 15, "Balance")
        L(false, LAngle, 22, "<")
        L(false, NumHex, 25, "2")
        L(false, CColon, 26, "::")
        L(false, Ident, 28, "sui")
        L(false, CColon, 31, "::")
        L(false, Ident, 33, "SUI")
        L(false, RAngle, 36, ">")
        L(false, LParen, 37, "(")
        L(false, NumDec, 38, "42")
        L(false, Ident, 40, "u64")
        L(false, RParen, 43, ")")
        L(false, RBrace, 44, "}")
        "###);
    }

    /// Struct literals can also include field names -- these are purely informational, they don't
    /// affect the encoded output.
    #[test]
    fn test_struct_literals() {
        assert_snapshot!(lexemes(r#"{0x2::coin::Coin<0x2::sui::SUI> { id: @0x123, value: 42u64 }}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, NumHex, 3, "2")
        L(false, CColon, 4, "::")
        L(false, Ident, 6, "coin")
        L(false, CColon, 10, "::")
        L(false, Ident, 12, "Coin")
        L(false, LAngle, 16, "<")
        L(false, NumHex, 19, "2")
        L(false, CColon, 20, "::")
        L(false, Ident, 22, "sui")
        L(false, CColon, 25, "::")
        L(false, Ident, 27, "SUI")
        L(false, RAngle, 30, ">")
        L(true, LBrace, 32, "{")
        L(true, Ident, 34, "id")
        L(false, Colon, 36, ":")
        L(true, At, 38, "@")
        L(false, NumHex, 41, "123")
        L(false, Comma, 44, ",")
        L(true, Ident, 46, "value")
        L(false, Colon, 51, ":")
        L(true, NumDec, 53, "42")
        L(false, Ident, 55, "u64")
        L(true, RBrace, 59, "}")
        L(false, RBrace, 60, "}")
        "###);
    }

    /// Enums are like structs but with an additional variant component. The variant must at least
    /// specify the variant index, and can optionally specify a variant name, which is only
    /// relevant for documentation purposes (it does not affect the encoding).
    #[test]
    fn test_enum_literals() {
        assert_snapshot!(lexemes(r#"{0x2::option::Option<u64>::1(42) 0x2::option::Option<u64>::Some#1(43)}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, NumHex, 3, "2")
        L(false, CColon, 4, "::")
        L(false, Ident, 6, "option")
        L(false, CColon, 12, "::")
        L(false, Ident, 14, "Option")
        L(false, LAngle, 20, "<")
        L(false, Ident, 21, "u64")
        L(false, RAngle, 24, ">")
        L(false, CColon, 25, "::")
        L(false, NumDec, 27, "1")
        L(false, LParen, 28, "(")
        L(false, NumDec, 29, "42")
        L(false, RParen, 31, ")")
        L(true, NumHex, 35, "2")
        L(false, CColon, 36, "::")
        L(false, Ident, 38, "option")
        L(false, CColon, 44, "::")
        L(false, Ident, 46, "Option")
        L(false, LAngle, 52, "<")
        L(false, Ident, 53, "u64")
        L(false, RAngle, 56, ">")
        L(false, CColon, 57, "::")
        L(false, Ident, 59, "Some")
        L(false, Pound, 63, "#")
        L(false, NumDec, 64, "1")
        L(false, LParen, 65, "(")
        L(false, NumDec, 66, "43")
        L(false, RParen, 68, ")")
        L(false, RBrace, 69, "}")
        "###);
    }

    /// Tokenizing three kinds of string literals hex, binary, and regular.
    #[test]
    fn string_literals() {
        assert_snapshot!(lexemes(r#"{x'0f00' b'bar' 'baz'}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, Ident, 1, "x")
        L(false, String, 3, "0f00")
        L(true, Ident, 9, "b")
        L(false, String, 11, "bar")
        L(true, String, 17, "baz")
        L(false, RBrace, 21, "}")
        "###);
    }

    /// Make sure the string does not stop early on an escaped quote, it's fine to escape random
    /// characters, and an escaped backslash does not eat the closing quote.
    #[test]
    fn test_string_literal_escapes() {
        assert_snapshot!(lexemes(r#"{'\' \x \\'}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, String, 2, "\\' \\x \\\\")
        L(false, RBrace, 11, "}")
        "###);
    }

    /// If the string literal is not closed, the whole sequence is treated as an "unexpected"
    /// token.
    #[test]
    fn test_string_literal_trailing() {
        assert_snapshot!(lexemes(r#"{'foo bar}"#), @r###"
        L(false, LBrace, 0, "{")
        L(false, Unexpected, 1, "'foo bar}")
        "###);
    }

    /// Test handling of single-byte unexpected characters followed by valid tokens.
    #[test]
    fn test_unexpected_single_byte() {
        assert_snapshot!(lexemes("{$hello}"), @r###"
        L(false, LBrace, 0, "{")
        L(false, Unexpected, 1, "$")
        L(false, Ident, 2, "hello")
        L(false, RBrace, 7, "}")
        "###);
    }

    /// Test unexpected character followed by multi-byte UTF-8 character.
    #[test]
    fn test_unexpected_before_multibyte() {
        assert_snapshot!(lexemes("{$é}"), @r###"
        L(false, LBrace, 0, "{")
        L(false, Unexpected, 1, "$")
        L(false, Unexpected, 2, "\xC3\xA9")
        L(false, RBrace, 4, "}")
        "###);
    }

    /// Test handling of various unexpected multi-byte UTF-8 characters in expression mode.
    #[test]
    fn test_unexpected_characters_utf8_safe() {
        assert_snapshot!(lexemes("{$∑∞}"), @r###"
        L(false, LBrace, 0, "{")
        L(false, Unexpected, 1, "$")
        L(false, Unexpected, 2, "\xE2\x88\x91")
        L(false, Unexpected, 5, "\xE2\x88\x9E")
        L(false, RBrace, 8, "}")
        "###);
    }

    /// Test that ASCII-only whitespace handling works correctly. ASCII whitespace is recognized,
    /// and non-ASCII whitespace is treated as unexpected.
    #[test]
    fn test_ascii_whitespace_only() {
        assert_snapshot!(lexemes("{ \t\n\u{00A0}hello}"), @r###"
        L(false, LBrace, 0, "{")
        L(true, Unexpected, 4, "\xC2\xA0")
        L(false, Ident, 6, "hello")
        L(false, RBrace, 11, "}")
        "###);
    }

    /// Test the UTF-8 boundary fallback when string ends with incomplete UTF-8.
    /// This exercises the .unwrap_or(self.src.len()) fallback in the boundary detection.
    #[test]
    fn test_incomplete_utf8_boundary_fallback() {
        // Create input with incomplete UTF-8: '{' + first byte of multi-byte sequence
        let mut input = vec![b'{'];
        input.push(0xC3); // First byte of multi-byte UTF-8 sequence (missing continuation)
        let input_str = unsafe { std::str::from_utf8_unchecked(&input) };

        // The boundary detection should fall back to src.len() when no boundary is found
        assert_snapshot!(lexemes(input_str), @r###"
        L(false, LBrace, 0, "{")
        L(false, Unexpected, 1, "\xC3")
        "###);
    }
}
907}