tokenize function

List<Token> tokenize(
  String source, [
  PreprocessOptions options = const PreprocessOptions()
])

Generate a list of tokens from a source string.

Implementation

/// Generates a list of tokens from a template [source] string.
///
/// The source is first passed through `preprocess` together with [options].
/// Everything outside of `{% %}`, `{{ }}` and `{# #}` is emitted as a single
/// `Text` token; the contents of statements and expressions are split into
/// individual tokens, with whitespace between them skipped.
///
/// Throws a [SyntaxError] when the input ends in the middle of a token, when a
/// comment is never closed, when an unknown escape sequence is used, or when
/// a character cannot start any known token.
List<Token> tokenize(String source, [PreprocessOptions options = const PreprocessOptions()]) {
  final List<Token> tokens = [];
  final String src = preprocess(source, options);

  // Compiled once rather than once per inspected character.
  final whitespace = RegExp(r'\s');

  int cursorPosition = 0;
  int curlyBracketDepth = 0;

  // Whether a statement (`{%`), expression (`{{`) or comment (`{#`) opens at
  // [index]. A lone `{` at the very end of the input is not an opening tag.
  bool opensTag(int index) {
    if (src[index] != '{' || index + 1 >= src.length) return false;
    final next = src[index + 1];
    return next == '%' || next == '{' || next == '#';
  }

  // Consumes characters for as long as [predicate] accepts them, resolving
  // backslash escapes through ESCAPE_CHARACTERS, and returns what was read.
  //
  // Returns an empty string when the cursor is already at the end of the
  // input. Throws a SyntaxError if the input ends after a character has been
  // consumed, because every caller expects a terminator to follow.
  String consumeWhile(bool Function(String char) predicate) {
    final consumed = StringBuffer();
    while (cursorPosition < src.length && predicate(src[cursorPosition])) {
      if (src[cursorPosition] == r'\') {
        // Consume the backslash
        ++cursorPosition;
        // Check for end of input
        if (cursorPosition >= src.length) throw SyntaxError("Unexpected end of input");

        // Add the escaped character
        final escaped = src[cursorPosition++];
        final unescaped = ESCAPE_CHARACTERS[escaped];
        if (unescaped == null) {
          throw SyntaxError('Unexpected escaped character: $escaped');
        }
        consumed.write(unescaped);
      } else {
        consumed.write(src[cursorPosition++]);
      }

      // Applies to both branches: an escape sequence that finishes the input
      // is just as unterminated as a plain character that does.
      if (cursorPosition >= src.length) throw SyntaxError("Unexpected end of input");
    }
    return consumed.toString();
  }

  // Build each token until end of input
  main: while (cursorPosition < src.length) {
    // First, consume all text that is outside of a Jinja statement or expression
    final lastTokenType = tokens.isNotEmpty ? tokens.last.type : null;
    if (lastTokenType == null ||
        lastTokenType == TokenType.CloseStatement ||
        lastTokenType == TokenType.CloseExpression ||
        lastTokenType == TokenType.Comment) {
      final textStart = cursorPosition;
      // Keep going until we hit the next Jinja statement, expression or comment
      while (cursorPosition < src.length && !opensTag(cursorPosition)) {
        cursorPosition++;
      }

      // There is some text to add
      if (cursorPosition > textStart) {
        tokens.add(Token(src.substring(textStart, cursorPosition), TokenType.Text));
        continue;
      }
    }

    // Possibly consume a comment
    if (src.startsWith('{#', cursorPosition)) {
      final commentStart = cursorPosition + 2; // Skip the opening {#
      final commentEnd = src.indexOf('#}', commentStart);
      if (commentEnd == -1) {
        throw SyntaxError("Missing end of comment tag");
      }
      tokens.add(Token(src.substring(commentStart, commentEnd), TokenType.Comment));
      cursorPosition = commentEnd + 2; // Skip the closing #}
      continue;
    }

    // Consume (and ignore) all whitespace inside Jinja statements or expressions
    consumeWhile(whitespace.hasMatch);

    // Defensive guard: never index past the end of the input below
    if (cursorPosition >= src.length) {
      break;
    }

    // Handle multi-character tokens
    final char = src[cursorPosition];

    // Check for unary operators
    if (char == '-' || char == '+') {
      final lastTokenType = tokens.isNotEmpty ? tokens.last.type : null;
      if (lastTokenType == null || lastTokenType == TokenType.Text) {
        throw SyntaxError('Unexpected character: $char');
      }
      switch (lastTokenType) {
        case TokenType.Identifier:
        case TokenType.NumericLiteral:
        case TokenType.StringLiteral:
        case TokenType.CloseParen:
        case TokenType.CloseSquareBracket:
          // Part of a binary operator
          // a - 1, 1 - 1, true - 1, "apple" - 1, (1) - 1, a[1] - 1
          // Continue parsing normally
          break;

        default:
          // Is part of a unary operator
          // (-1), [-1], (1 + -1), not -1, -apple
          cursorPosition++; // consume the unary operator

          // Check for numbers following the unary operator; a sign directly
          // followed by digits becomes a single signed numeric literal
          final num = consumeWhile(isInteger);
          tokens.add(Token(
            '$char$num',
            num.isNotEmpty
                ? TokenType.NumericLiteral
                : TokenType.UnaryOperator,
          ));
          continue;
      }
    }

    // Try to match one of the tokens in the mapping table
    for (final (seq, type) in ORDERED_MAPPING_TABLE) {
      // inside an object literal, don't treat "}}" as expression-end
      if (seq == '}}' && curlyBracketDepth > 0) {
        continue;
      }
      if (src.startsWith(seq, cursorPosition)) {
        tokens.add(Token(seq, type));

        // possibly adjust the curly bracket depth
        if (type == TokenType.OpenExpression) {
          curlyBracketDepth = 0;
        } else if (type == TokenType.OpenCurlyBracket) {
          curlyBracketDepth++;
        } else if (type == TokenType.CloseCurlyBracket) {
          curlyBracketDepth--;
        }
        cursorPosition += seq.length;
        continue main;
      }
    }

    if (char == "'" || char == '"') {
      cursorPosition++; // Skip the opening quote
      // Read up to the matching quote; escapes are resolved by consumeWhile
      final str = consumeWhile((c) => c != char);
      tokens.add(Token(str, TokenType.StringLiteral));
      cursorPosition++; // Skip the closing quote
      continue;
    }

    if (isInteger(char)) {
      // Consume integer part
      String num = consumeWhile(isInteger);
      // Possibly, consume fractional part (only when a digit follows the dot)
      if (cursorPosition + 1 < src.length &&
          src[cursorPosition] == '.' &&
          isInteger(src[cursorPosition + 1])) {
        cursorPosition++; // consume '.'
        final frac = consumeWhile(isInteger);
        num = '$num.$frac';
      }
      tokens.add(Token(num, TokenType.NumericLiteral));
      continue;
    }
    if (isWord(char)) {
      // consume any word characters and always classify as Identifier
      final word = consumeWhile(isWord);
      tokens.add(Token(word, TokenType.Identifier));
      continue;
    }

    throw SyntaxError('Unexpected character: $char');
  }
  return tokens;
}