Skip to content

Gitignore Parsing: Code Companion

Reference code for the Gitignore Parsing lecture. Sections correspond to the lecture document.


Section 1: The Glob Data Structure

/// A single glob pattern taken from a gitignore file.
///
/// Values of this type are handed back to callers to describe the
/// highest-precedence glob that matched among one or more gitignore files.
#[derive(Clone, Debug)]
pub struct Glob {
    /// Path of the gitignore file this glob was read from, if known.
    from: Option<PathBuf>,
    /// The pattern text as it was given in the gitignore file.
    original: String,
    /// The rewritten pattern text that is actually compiled for matching.
    actual: String,
    /// True when the pattern was negated with a leading `!`.
    is_whitelist: bool,
    /// True when the pattern ended with `/` and so applies to directories only.
    is_only_dir: bool,
}

impl Glob {
    /// The gitignore file that defined this glob, or `None` if no file was
    /// recorded for it.
    pub fn from(&self) -> Option<&Path> {
        self.from.as_deref()
    }

    /// The pattern text as it was given in the gitignore file.
    pub fn original(&self) -> &str {
        self.original.as_str()
    }

    /// The rewritten pattern text that was compiled to implement gitignore
    /// semantics.
    pub fn actual(&self) -> &str {
        self.actual.as_str()
    }

    /// True if this glob un-ignores paths instead of ignoring them.
    pub fn is_whitelist(&self) -> bool {
        self.is_whitelist
    }

    /// True if this glob may only match directories.
    pub fn is_only_dir(&self) -> bool {
        self.is_only_dir
    }

    /// Reports whether the compiled pattern already begins with a double
    /// star: either it starts with `**/` or it is exactly `**`.
    fn has_doublestar_prefix(&self) -> bool {
        let pattern = self.actual.as_str();
        pattern == "**" || pattern.starts_with("**/")
    }
}

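The original vs. actual distinction preserves the user's input for diagnostics while storing the transformed pattern for matching. For example, foo/ is stored as foo/ in original, but becomes **/foo in actual with is_only_dir set to true: the trailing slash is removed, and because no other slash remains, the **/ prefix is added so the pattern can match at any depth.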


Section 2: The Gitignore Matcher

/// Gitignore is a matcher for the globs in one or more gitignore files
/// in the same directory.
///
/// A populated matcher is produced by `GitignoreBuilder::build`; a matcher
/// with no globs is produced by `Gitignore::empty`.
#[derive(Clone, Debug)]
pub struct Gitignore {
    /// The compiled glob set from the globset crate. Matching against it
    /// yields the indices of every glob that matched a candidate path.
    set: GlobSet,
    /// The root directory for relative matching. Candidate paths have this
    /// prefix stripped before they are matched.
    root: PathBuf,
    /// All globs with their metadata, indexed by the positions that `set`
    /// reports for a match.
    globs: Vec<Glob>,
    /// Count of ignore patterns, i.e. globs that are not whitelists.
    num_ignores: u64,
    /// Count of whitelist patterns (! prefixed).
    num_whitelists: u64,
    /// Object pool for scratch vectors during matching. `None` for a matcher
    /// created by `empty`, `Some` for one created by the builder.
    matches: Option<Arc<Pool<Vec<usize>>>>,
}

impl Gitignore {
    /// Builds a matcher that holds no globs at all.
    ///
    /// Its root is the empty path, both pattern counters are zero and no
    /// scratch pool is allocated.
    pub fn empty() -> Gitignore {
        let set = GlobSet::empty();
        let root = PathBuf::new();
        Gitignore {
            set,
            root,
            globs: Vec::new(),
            num_ignores: 0,
            num_whitelists: 0,
            matches: None,
        }
    }

    /// Reports whether the underlying glob set contains no globs.
    pub fn is_empty(&self) -> bool {
        let set = &self.set;
        set.is_empty()
    }

    /// Reports how many globs the underlying glob set contains.
    pub fn len(&self) -> usize {
        let set = &self.set;
        set.len()
    }
}

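The matches field is an Option<Arc<Pool<Vec<usize>>>>: it is None for the empty matcher and Some once a matcher has been produced by the builder. The Arc-wrapped pool provides thread-safe reuse of scratch buffers. It hands out vectors that collect the indices of the patterns that matched, which avoids allocating a fresh vector on every match operation.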


Section 3: Path Stripping and Relative Matching

impl Gitignore {
    /// Reduces `path` to the form that the compiled globs expect: relative
    /// to this matcher's root and without a redundant leading `./`.
    ///
    /// The result is always a sub-slice of the given path; nothing is
    /// allocated.
    fn strip<'a, P: 'a + AsRef<Path> + ?Sized>(
        &'a self,
        path: &'a P,
    ) -> &'a Path {
        let candidate = path.as_ref();

        // The root had any leading `./` removed when the builder was
        // created, so the candidate must be normalized in the same way.
        let candidate = strip_prefix("./", candidate).unwrap_or(candidate);

        // Two cases where the root must not be stripped:
        //   * the root is just `.`, which would otherwise eat the leading
        //     dot of a candidate such as `.foo`;
        //   * the candidate is a bare file name with no directory
        //     components, where a prefix match would chop up the name
        //     itself.
        if self.root == Path::new(".") || is_file_name(candidate) {
            return candidate;
        }

        match strip_prefix(&self.root, candidate) {
            // The candidate does not live under the root: leave it alone.
            None => candidate,
            // Removing the root may leave a leading separator behind; drop
            // it so the remainder is a proper relative path.
            Some(relative) => strip_prefix("/", relative).unwrap_or(relative),
        }
    }
}

The is_file_name check prevents stripping part of a bare filename that happens to match the root prefix. The lifetime 'a ensures the returned path slice borrows from the input.


Section 4: The Matching Algorithm

impl Gitignore {
    /// Tests `path` against the globs of this matcher.
    ///
    /// `is_dir` must state whether `path` refers to a directory; it decides
    /// whether directory-only globs are allowed to apply. The path is made
    /// relative to the matcher's root before it is tested.
    ///
    /// Returns `Match::Ignore` or `Match::Whitelist` carrying the winning
    /// glob, or `Match::None` when no glob applies.
    pub fn matched<P: AsRef<Path>>(
        &self,
        path: P,
        is_dir: bool,
    ) -> Match<&Glob> {
        if self.is_empty() {
            Match::None
        } else {
            let stripped = self.strip(path.as_ref());
            self.matched_stripped(stripped, is_dir)
        }
    }

    /// Same as `matched`, except that `path` is assumed to have been passed
    /// through `strip` already.
    fn matched_stripped<P: AsRef<Path>>(
        &self,
        path: P,
        is_dir: bool,
    ) -> Match<&Glob> {
        if self.is_empty() {
            return Match::None;
        }
        let stripped: &Path = path.as_ref();

        // Borrow a reusable scratch vector and let the glob set fill it
        // with the index of every glob that matches the candidate.
        let mut hits = self.matches.as_ref().unwrap().get();
        let candidate = Candidate::new(stripped);
        self.set.matches_candidate_into(&candidate, &mut *hits);

        // Later globs take precedence over earlier ones, so scan from the
        // back and stop at the first glob that is allowed to apply. A
        // directory-only glob is skipped unless the path is a directory.
        let winner = hits
            .iter()
            .rev()
            .map(|&idx| &self.globs[idx])
            .find(|glob| is_dir || !glob.is_only_dir());

        match winner {
            None => Match::None,
            Some(glob) if glob.is_whitelist() => Match::Whitelist(glob),
            Some(glob) => Match::Ignore(glob),
        }
    }
}

The reverse iteration implements gitignore's "last pattern wins" semantics. The is_dir parameter is filesystem knowledge that the glob matcher can't determine from the pattern alone.


Section 5: The Builder Pattern

/// Builds a matcher for a single set of globs from a .gitignore file.
///
/// Patterns are added one line at a time with `add_line`; `build` then
/// compiles everything added so far into a `Gitignore`.
#[derive(Clone, Debug)]
pub struct GitignoreBuilder {
    /// Accumulates the compiled form of every glob added so far.
    builder: GlobSetBuilder,
    /// Root directory for relative matching, stored without a leading `./`.
    root: PathBuf,
    /// Metadata for every glob added so far, in the order they were added
    /// to `builder`.
    globs: Vec<Glob>,
    /// Whether globs are compiled case-insensitively. Defaults to false.
    case_insensitive: bool,
    /// Whether an unclosed character class is accepted when compiling a glob.
    /// Defaults to true to match git's permissive behavior.
    allow_unclosed_class: bool,
}

impl GitignoreBuilder {
    /// Starts a builder whose globs will be matched relative to `root`.
    ///
    /// The builder begins with no globs, case-sensitive matching, and
    /// unclosed character classes allowed.
    pub fn new<P: AsRef<Path>>(root: P) -> GitignoreBuilder {
        let given = root.as_ref();
        // Normalize away a leading `./` once, here, so that it does not
        // have to be dealt with on the root for every match.
        let normalized = match strip_prefix("./", given) {
            Some(rest) => rest,
            None => given,
        };
        GitignoreBuilder {
            builder: GlobSetBuilder::new(),
            root: normalized.to_path_buf(),
            globs: Vec::new(),
            case_insensitive: false,
            // Mirrors git, which tolerates an unclosed character class.
            allow_unclosed_class: true,
        }
    }

    /// Compiles every glob added so far into a `Gitignore` matcher.
    ///
    /// The builder itself is left untouched, so more lines can be added and
    /// `build` called again.
    ///
    /// # Errors
    ///
    /// Returns `Error::Glob` (with no specific glob attached) if the glob
    /// set as a whole fails to compile.
    pub fn build(&self) -> Result<Gitignore, Error> {
        let set = self
            .builder
            .build()
            .map_err(|err| Error::Glob { glob: None, err: err.to_string() })?;

        // Every glob is either a whitelist or an ignore, so a single count
        // determines both totals.
        let num_whitelists =
            self.globs.iter().filter(|glob| glob.is_whitelist()).count();
        let num_ignores = self.globs.len() - num_whitelists;

        Ok(Gitignore {
            set,
            root: self.root.clone(),
            globs: self.globs.clone(),
            num_ignores: num_ignores as u64,
            num_whitelists: num_whitelists as u64,
            // Scratch vectors are created lazily, on demand, by the pool.
            matches: Some(Arc::new(Pool::new(|| vec![]))),
        })
    }
}

The builder wraps a GlobSetBuilder internally, maintaining the abstraction layer. The pool is created at build time with a closure that produces empty vectors on demand.


Section 6: Parsing Individual Lines

impl GitignoreBuilder {
    /// Add a line from a gitignore file to this builder.
    pub fn add_line(
        &mut self,
        from: Option<PathBuf>,
        mut line: &str,
    ) -> Result<&mut GitignoreBuilder, Error> {
        // Skip comments.
        if line.starts_with("#") {
            return Ok(self);
        }
        // Trim trailing whitespace unless escaped.
        if !line.ends_with("\\ ") {
            line = line.trim_right();
        }
        // Skip blank lines.
        if line.is_empty() {
            return Ok(self);
        }

        let mut glob = Glob {
            from,
            original: line.to_string(),
            actual: String::new(),
            is_whitelist: false,
            is_only_dir: false,
        };
        let mut is_absolute = false;

        // Handle escaped ! or # at start (literal match).
        if line.starts_with("\\!") || line.starts_with("\\#") {
            line = &line[1..];
            is_absolute = line.chars().nth(0) == Some('/');
        } else {
            // Check for whitelist prefix.
            if line.starts_with("!") {
                glob.is_whitelist = true;
                line = &line[1..];
            }
            // Leading slash means anchor to root.
            if line.starts_with("/") {
                line = &line[1..];
                is_absolute = true;
            }
        }

        // Trailing slash means directory-only match.
        if line.as_bytes().last() == Some(&b'/') {
            glob.is_only_dir = true;
            line = &line[..line.len() - 1];
            // Handle escaped trailing slash.
            if line.as_bytes().last() == Some(&b'\\') {
                line = &line[..line.len() - 1];
            }
        }

        glob.actual = line.to_string();

        // Patterns without / match anywhere (add **/ prefix).
        if !is_absolute && !line.chars().any(|c| c == '/') {
            if !glob.has_doublestar_prefix() {
                glob.actual = format!("**/{}", glob.actual);
            }
        }

        // Fix /** to not match the directory itself.
        if glob.actual.ends_with("/**") {
            glob.actual = format!("{}/*", glob.actual);
        }

        // Build the actual glob with gitignore-specific options.
        let parsed = GlobBuilder::new(&glob.actual)
            .literal_separator(true)      // * doesn't match /
            .case_insensitive(self.case_insensitive)
            .backslash_escape(true)       // \ escapes special chars
            .allow_unclosed_class(self.allow_unclosed_class)
            .build()
            .map_err(|err| Error::Glob {
                glob: Some(glob.original.clone()),
                err: err.kind().to_string(),
            })?;

        self.builder.add(parsed);
        self.globs.push(glob);
        Ok(self)
    }
}

The transformation order matters: whitelist detection happens before slash stripping, and **/ prefixing happens after both. The literal_separator(true) ensures * won't match path separators, matching gitignore semantics.


Quick Reference

Pattern Transformations

Input Pattern Transformed Pattern Semantics
foo **/foo Match anywhere in tree
/foo foo Match only at root
foo/ **/foo + is_only_dir=true Match directories only, anywhere in tree
!foo **/foo + is_whitelist=true Un-ignore pattern
\!foo **/!foo Literal ! in name
foo/** foo/**/* Match contents, not directory

Match Result Types

/// The outcome of testing a path against a set of patterns.
///
/// `T` is the payload describing the pattern that decided the outcome; the
/// gitignore matcher uses `&Glob`.
pub enum Match<T> {
    /// No pattern applied to the path.
    None,           // No pattern matched
    /// The deciding pattern says the path should be ignored.
    Ignore(T),      // Should be ignored
    /// The deciding pattern is a whitelist (`!`) pattern, so the path is
    /// explicitly not ignored.
    Whitelist(T),   // Explicitly un-ignored
}

Key Builder Options

Option Default Purpose
case_insensitive false Case-insensitive matching
allow_unclosed_class true Treat [abc as literal (git compat)

Precedence Rules

  1. Last matching pattern wins
  2. Whitelist (!) can override earlier ignores
  3. Directory-only patterns (foo/) only match if is_dir=true
  4. Patterns in child directories override parent patterns