Skip to content

ripgrep crates/regex/src/ast.rs: Code Companion

Reference code for the AST Manipulation lecture. Sections correspond to the lecture document.


Section 1: The Analysis Result Structure

use regex_syntax::ast::{self, Ast};

/// The results of analyzing AST of a regular expression (e.g., for supporting
/// smart case).
#[derive(Clone, Debug)]
pub(crate) struct AstAnalysis {
    /// True if and only if a literal uppercase character occurs in the regex.
    any_uppercase: bool,
    /// True if and only if the regex contains any literal at all.
    any_literal: bool,
}

The pub(crate) visibility restricts this struct to the current crate only. The regex_syntax crate provides the AST types we'll traverse.


Section 2: The Entry Points

impl AstAnalysis {
    /// Returns a `AstAnalysis` value by doing analysis on the AST of `pattern`.
    ///
    /// If `pattern` is not a valid regular expression, then `None` is
    /// returned.
    #[cfg(test)]  // Only compiled in test builds
    pub(crate) fn from_pattern(pattern: &str) -> Option<AstAnalysis> {
        regex_syntax::ast::parse::Parser::new()
            .parse(pattern)           // Returns Result<Ast, Error>
            .map(|ast| AstAnalysis::from_ast(&ast))  // Transform Ok variant
            .ok()                     // Convert Result to Option, discarding error
    }

    /// Perform an AST analysis given the AST.
    pub(crate) fn from_ast(ast: &Ast) -> AstAnalysis {
        let mut analysis = AstAnalysis::new();  // Start with both flags false
        analysis.from_ast_impl(ast);            // Mutate during traversal
        analysis
    }
}

The from_pattern method chains parser creation, parsing, mapping, and error handling in a single expression. The from_ast method follows a common Rust pattern: create default state, mutate it, return it.


Section 3: The Accessor Methods

impl AstAnalysis {
    /// Returns true if and only if a literal uppercase character occurs in
    /// the pattern.
    ///
    /// For example, a pattern like `\pL` contains no uppercase literals,
    /// even though `L` is uppercase and the `\pL` class contains uppercase
    /// characters.
    pub(crate) fn any_uppercase(&self) -> bool {
        self.any_uppercase
    }

    /// Returns true if and only if the regex contains any literal at all.
    ///
    /// For example, a pattern like `\pL` reports `false`, but a pattern like
    /// `\pLfoo` reports `true`.
    pub(crate) fn any_literal(&self) -> bool {
        self.any_literal
    }

    /// Creates a new `AstAnalysis` value with an initial configuration.
    fn new() -> AstAnalysis {
        AstAnalysis { any_uppercase: false, any_literal: false }
    }
}

Accessors return bool directly rather than &bool since bool is Copy. The new() constructor is private—callers must use from_ast or from_pattern.


Section 4: The Tree Traversal Strategy

fn from_ast_impl(&mut self, ast: &Ast) {
    // Early termination: stop if we've found everything
    if self.done() {
        return;
    }
    match *ast {
        // Nodes with no literals - do nothing
        Ast::Empty(_) => {}
        Ast::Flags(_)
        | Ast::Dot(_)
        | Ast::Assertion(_)
        | Ast::ClassUnicode(_)    // \pL, \p{Greek}, etc.
        | Ast::ClassPerl(_) => {} // \d, \w, \s, etc.

        // Nodes containing literals directly
        Ast::Literal(ref x) => {
            self.from_ast_literal(x);
        }
        Ast::ClassBracketed(ref x) => {
            self.from_ast_class_set(&x.kind);  // [a-z], [^0-9], etc.
        }

        // Nodes containing sub-trees - recurse
        Ast::Repetition(ref x) => {
            self.from_ast_impl(&x.ast);  // a*, a+, a{3}, etc.
        }
        Ast::Group(ref x) => {
            self.from_ast_impl(&x.ast);  // (a), (?:a), (?P<name>a)
        }
        Ast::Alternation(ref alt) => {
            for x in &alt.asts {          // a|b|c
                self.from_ast_impl(x);
            }
        }
        Ast::Concat(ref alt) => {
            for x in &alt.asts {          // abc (sequence)
                self.from_ast_impl(x);
            }
        }
    }
}

The match *ast dereferences to match on the enum directly. The ref keyword borrows the inner data without moving it out of the AST.


Section 5: Handling Character Classes

fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
    if self.done() {
        return;
    }
    match *ast {
        ast::ClassSet::Item(ref item) => {
            self.from_ast_class_set_item(item);
        }
        ast::ClassSet::BinaryOp(ref x) => {
            // Intersection: [a-z&&[aeiou]]
            // Subtraction: [a-z--[aeiou]]
            self.from_ast_class_set(&x.lhs);
            self.from_ast_class_set(&x.rhs);
        }
    }
}

fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
    if self.done() {
        return;
    }
    match *ast {
        // Built-in classes have no user literals
        ast::ClassSetItem::Empty(_)
        | ast::ClassSetItem::Ascii(_)   // [:alpha:], [:digit:]
        | ast::ClassSetItem::Unicode(_) // \p{Greek}
        | ast::ClassSetItem::Perl(_) => {}  // \d, \w

        // Single literal in class: [a] or [abc]
        ast::ClassSetItem::Literal(ref x) => {
            self.from_ast_literal(x);
        }
        // Range: [a-z] has two literals (start and end)
        ast::ClassSetItem::Range(ref x) => {
            self.from_ast_literal(&x.start);
            self.from_ast_literal(&x.end);
        }
        // Nested brackets: [[a-z][0-9]]
        ast::ClassSetItem::Bracketed(ref x) => {
            self.from_ast_class_set(&x.kind);
        }
        // Union of items: [abc0-9]
        ast::ClassSetItem::Union(ref union) => {
            for x in &union.items {
                self.from_ast_class_set_item(x);
            }
        }
    }
}

Character classes have their own mini-AST with items, ranges, and operations. The traversal mirrors this structure with dedicated methods.


Section 6: Literal Detection

fn from_ast_literal(&mut self, ast: &ast::Literal) {
    self.any_literal = true;
    // is_uppercase() handles full Unicode, not just ASCII
    self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
}

The ast.c field is a char. The || short-circuits: once any_uppercase is true, is_uppercase() won't be called.


Section 7: The Early Termination Condition

/// Returns true if and only if the attributes can never change no matter
/// what other AST it might see.
fn done(&self) -> bool {
    self.any_uppercase && self.any_literal
}

Both flags must be true to stop early. This is a monotonic analysis—flags only transition false → true, never backwards.


Section 8: Test Coverage and Edge Cases

#[cfg(test)]
mod tests {
    use super::*;

    fn analysis(pattern: &str) -> AstAnalysis {
        AstAnalysis::from_pattern(pattern).unwrap()
    }

    #[test]
    fn various() {
        // Empty pattern: no literals at all
        let x = analysis("");
        assert!(!x.any_uppercase);
        assert!(!x.any_literal);

        // Lowercase only
        let x = analysis("foo");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        // Uppercase present
        let x = analysis("Foo");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        // Key distinction: \S (escape) vs \\S (literal backslash + S)
        let x = analysis(r"foo\S");      // \S is a metacharacter
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = analysis(r"foo\\S");     // Literal \ followed by literal S
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        // Unicode property class - no literals despite uppercase L
        let x = analysis(r"\p{Ll}");
        assert!(!x.any_uppercase);
        assert!(!x.any_literal);

        // Character class ranges: [A-Z] contains uppercase boundaries
        let x = analysis(r"foo[A-Z]");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        // Unicode escape: \u0061 is 'a', still counts as literal
        let x = analysis(r"a\u0061");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);
    }
}

The r"..." raw string syntax avoids double-escaping backslashes. Each test case targets a specific edge case in the analysis logic.


Quick Reference

AstAnalysis API

Method Returns Purpose
from_ast(&Ast) AstAnalysis Production entry point
from_pattern(&str) Option<AstAnalysis> Test-only convenience method
any_uppercase(&self) bool Check for uppercase literals
any_literal(&self) bool Check for any literals

AST Node Categories

Category Examples Contains Literals?
Empty/Meta Empty, Dot, Assertion No
Classes ClassUnicode, ClassPerl No (built-in)
Literal Literal Yes
Bracketed ClassBracketed Depends on contents
Containers Repetition, Group, Alternation, Concat Recurse to check

Smart Case Behavior

Pattern     any_uppercase  any_literal  Case Mode
─────────────────────────────────────────────────
foo         false          true         Insensitive
Foo         true           true         Sensitive
\pL         false          false        (no literals)
[A-Z]       true           true         Sensitive
\S          false          false        (no literals)
\\S         true           true         Sensitive