Skip to content

ripgrep crates/globset/src/glob.rs: Code Companion

Reference code for the Glob Implementation lecture. Sections correspond to the lecture document.


Section 1: The Match Strategy Optimization

/// Describes a matching strategy for a particular pattern.
///
/// This provides a way to more quickly determine whether a pattern matches
/// a particular file path in a way that scales with a large number of
/// patterns.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) enum MatchStrategy {
    /// Pattern matches if and only if the entire file path matches this literal
    Literal(String),
    /// Pattern matches if and only if the file path's basename matches this literal
    BasenameLiteral(String),
    /// Pattern matches if and only if the file path's extension matches this literal
    Extension(String),
    /// Pattern matches if and only if this prefix literal is a prefix of the path
    Prefix(String),
    /// Pattern matches if and only if this suffix literal is a suffix of the path
    Suffix {
        suffix: String,
        /// Whether this must start at the beginning of a path component
        component: bool,
    },
    /// Extension match is necessary but NOT sufficient - still need regex
    RequiredExtension(String),
    /// No optimization available - use full regex matching
    Regex,
}

impl MatchStrategy {
    /// Returns a matching strategy for the given pattern.
    /// Tries optimizations in order of specificity/speed.
    pub(crate) fn new(pat: &Glob) -> MatchStrategy {
        // Try each optimization in order - first match wins
        if let Some(lit) = pat.basename_literal() {
            MatchStrategy::BasenameLiteral(lit)
        } else if let Some(lit) = pat.literal() {
            MatchStrategy::Literal(lit)
        } else if let Some(ext) = pat.ext() {
            MatchStrategy::Extension(ext)
        } else if let Some(prefix) = pat.prefix() {
            MatchStrategy::Prefix(prefix)
        } else if let Some((suffix, component)) = pat.suffix() {
            MatchStrategy::Suffix { suffix, component }
        } else if let Some(ext) = pat.required_ext() {
            MatchStrategy::RequiredExtension(ext)
        } else {
            MatchStrategy::Regex
        }
    }
}

The ordering in new reflects optimization priority: basename literal is checked first because patterns like **/Makefile are extremely common in .gitignore files and can be matched with a single string comparison.


Section 2: The Glob Type and Builder Pattern

/// Glob represents a successfully parsed shell glob pattern.
#[derive(Clone, Eq)]
pub struct Glob {
    glob: String,      // Original pattern text
    re: String,        // Generated regex string
    opts: GlobOptions, // Parsing/matching options
    tokens: Tokens,    // Parsed token representation
}

// Only compare pattern and options - not internal representation
impl PartialEq for Glob {
    fn eq(&self, other: &Glob) -> bool {
        self.glob == other.glob && self.opts == other.opts
    }
}

impl std::hash::Hash for Glob {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.glob.hash(state);
        self.opts.hash(state);
    }
}

// Compact debug by default, expanded with {:#?}
impl std::fmt::Debug for Glob {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if f.alternate() {
            // {:#?} shows all internals for debugging
            f.debug_struct("Glob")
                .field("glob", &self.glob)
                .field("re", &self.re)
                .field("opts", &self.opts)
                .field("tokens", &self.tokens)
                .finish()
        } else {
            // {:?} shows just the pattern
            f.debug_tuple("Glob").field(&self.glob).finish()
        }
    }
}
/// A builder for a pattern - lifetime tied to input pattern string.
#[derive(Clone, Debug)]
pub struct GlobBuilder<'a> {
    glob: &'a str,       // Borrowed pattern text
    opts: GlobOptions,   // Accumulated options
}

#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
struct GlobOptions {
    case_insensitive: bool,
    literal_separator: bool,  // When true, * and ? won't match /
    backslash_escape: bool,   // Platform-dependent default
    empty_alternates: bool,   // Allow {,a} to match "" and "a"
    allow_unclosed_class: bool,
}

impl GlobOptions {
    fn default() -> GlobOptions {
        GlobOptions {
            case_insensitive: false,
            literal_separator: false,
            // On Unix, \ is escape; on Windows, \ is path separator
            backslash_escape: !is_separator('\\'),
            empty_alternates: false,
            allow_unclosed_class: false,
        }
    }
}

The backslash_escape default uses is_separator('\\') to detect the platform: on Windows where \ is a path separator, escaping is disabled by default.


Section 3: The Token Representation

/// Wrapper type around Vec<Token> for type clarity
#[derive(Clone, Debug, Default, Eq, PartialEq)]
struct Tokens(Vec<Token>);

// Deref allows using Tokens as a Vec<Token>
impl std::ops::Deref for Tokens {
    type Target = Vec<Token>;
    fn deref(&self) -> &Vec<Token> {
        &self.0
    }
}

impl std::ops::DerefMut for Tokens {
    fn deref_mut(&mut self) -> &mut Vec<Token> {
        &mut self.0
    }
}

#[derive(Clone, Debug, Eq, PartialEq)]
enum Token {
    /// A literal character to match exactly
    Literal(char),
    /// ? - matches any single character
    Any,
    /// * - matches zero or more characters
    ZeroOrMore,
    /// **/ at pattern start - matches any prefix path
    RecursivePrefix,
    /// /** at pattern end - matches any suffix path
    RecursiveSuffix,
    /// /**/ in middle - must match at least one /
    RecursiveZeroOrMore,
    /// Character class like [a-z] or [!0-9]
    Class { negated: bool, ranges: Vec<(char, char)> },
    /// Alternation like {foo,bar,baz}
    Alternates(Vec<Tokens>),
}

The three recursive variants (RecursivePrefix, RecursiveSuffix, RecursiveZeroOrMore) capture the semantic differences of ** in different positions, which affects both regex generation and optimization extraction.


Section 4: Strategy Extraction Methods

impl Glob {
    /// Returns pattern as literal only if it contains no wildcards.
    fn literal(&self) -> Option<String> {
        // Case insensitive requires regex machinery
        if self.opts.case_insensitive {
            return None;
        }
        let mut lit = String::new();
        for t in &*self.tokens {
            // Bail if anything other than a literal
            let Token::Literal(c) = *t else { return None };
            lit.push(c);
        }
        if lit.is_empty() { None } else { Some(lit) }
    }

    /// Returns extension for patterns like *.ext or **/*.ext
    fn ext(&self) -> Option<String> {
        if self.opts.case_insensitive {
            return None;
        }
        // Check for optional **/ prefix
        let start = match *self.tokens.get(0)? {
            Token::RecursivePrefix => 1,
            _ => 0,
        };
        // Must have * that can match path separators
        match *self.tokens.get(start)? {
            Token::ZeroOrMore => {
                // Without **, * must be able to match /
                if start == 0 && self.opts.literal_separator {
                    return None;
                }
            }
            _ => return None,
        }
        // Must have literal dot
        match *self.tokens.get(start + 1)? {
            Token::Literal('.') => {}
            _ => return None,
        }
        // Rest must be literals without . or /
        let mut lit = ".".to_string();
        for t in self.tokens[start + 2..].iter() {
            match *t {
                Token::Literal('.') | Token::Literal('/') => return None,
                Token::Literal(c) => lit.push(c),
                _ => return None,
            }
        }
        if lit.is_empty() { None } else { Some(lit) }
    }

    /// Returns tokens for basename-only matching (e.g., **/filename)
    fn basename_tokens(&self) -> Option<&[Token]> {
        if self.opts.case_insensitive {
            return None;
        }
        let start = match *self.tokens.get(0)? {
            Token::RecursivePrefix => 1,
            _ => {
                // Without ** prefix, can't assume basename-only is correct
                return None;
            }
        };
        if self.tokens[start..].is_empty() {
            return None;
        }
        // Check remaining tokens don't escape basename
        for t in self.tokens[start..].iter() {
            match *t {
                Token::Literal('/') => return None,
                Token::Literal(_) => {} // OK
                Token::Any | Token::ZeroOrMore => {
                    if !self.opts.literal_separator {
                        // * or ? could match /, escaping basename
                        return None;
                    }
                }
                Token::RecursivePrefix
                | Token::RecursiveSuffix
                | Token::RecursiveZeroOrMore => return None,
                Token::Class { .. } | Token::Alternates(..) => {
                    // Too complex - give up on optimization
                    return None;
                }
            }
        }
        // Return slice to avoid allocation
        Some(&self.tokens[start..])
    }
}

Note that basename_tokens returns &[Token] rather than allocating a new Vec—this is efficient since we're just providing a view into existing data.


Section 5: The Parser Structure

struct Parser<'a> {
    /// The glob pattern being parsed
    glob: &'a str,
    /// Stack tracking where alternation groups begin
    alternates_stack: Vec<usize>,
    /// Active branches - tokens added to last one
    branches: Vec<Tokens>,
    /// Character iterator with peek capability
    chars: std::iter::Peekable<std::str::Chars<'a>>,
    /// Previous character (for context like detecting /**)
    prev: Option<char>,
    /// Current character
    cur: Option<char>,
    /// Tracks if we found unclosed [ (optimization to avoid quadratic time)
    found_unclosed_class: bool,
    /// Options that may influence parsing
    opts: &'a GlobOptions,
}

impl<'a> Parser<'a> {
    fn parse(&mut self) -> Result<(), Error> {
        while let Some(c) = self.bump() {
            match c {
                '?' => self.push_token(Token::Any)?,
                '*' => self.parse_star()?,
                '[' if !self.found_unclosed_class => self.parse_class()?,
                '{' => self.push_alternate()?,
                '}' => self.pop_alternate()?,
                ',' => self.parse_comma()?,
                '\\' => self.parse_backslash()?,
                c => self.push_token(Token::Literal(c))?,
            }
        }
        Ok(())
    }

    fn push_alternate(&mut self) -> Result<(), Error> {
        // Mark where this alternation starts
        self.alternates_stack.push(self.branches.len());
        // Start first branch of this alternation
        self.branches.push(Tokens::default());
        Ok(())
    }

    fn pop_alternate(&mut self) -> Result<(), Error> {
        let Some(start) = self.alternates_stack.pop() else {
            return Err(self.error(ErrorKind::UnopenedAlternates));
        };
        // Collect all branches since start into Alternates token
        let alts = Token::Alternates(self.branches.drain(start..).collect());
        self.push_token(alts)?;
        Ok(())
    }

    fn parse_comma(&mut self) -> Result<(), Error> {
        // Comma is special only inside alternation
        if self.alternates_stack.is_empty() {
            self.push_token(Token::Literal(','))
        } else {
            // Start new branch at same nesting level
            Ok(self.branches.push(Tokens::default()))
        }
    }
}

The alternates_stack and branches work together: when { is encountered, the current branch count is pushed to the stack. When } is seen, all branches since that index are collected into an Alternates token.


Quick Reference

Token Types

Token Pattern Meaning
Literal(c) a, b, / Match exact character
Any ? Match any single character
ZeroOrMore * Match zero or more characters
RecursivePrefix **/ Match any directory prefix
RecursiveSuffix /** Match any directory suffix
RecursiveZeroOrMore /**/ Match one or more directories
Class [a-z] Match character in class
Alternates {a,b} Match any alternative

Match Strategy Selection

Pattern           → Strategy
─────────────────────────────────────
**/Makefile       → BasenameLiteral
foo/bar           → Literal  
*.rs              → Extension
build/**          → Prefix
**/test.js        → Suffix (component=true)
**/*.test.js      → RequiredExtension
**/*_test/**/*.c  → Regex

Key Builder Options

Option Default Effect
case_insensitive false Enable case-insensitive matching
literal_separator false Require / to match path separator
backslash_escape platform Enable \ as escape character
empty_alternates false Allow {,a} to match empty string
allow_unclosed_class false Treat [ without ] as literal