Skip to content

ripgrep crates/searcher/src/searcher/core.rs: Code Companion

Reference code for the Search Algorithm lecture. Sections correspond to the lecture document.


Section 1: The Core State Machine

#[derive(Debug)]
pub(crate) struct Core<'s, M: 's, S> {
    // Configuration and dependencies (borrowed)
    config: &'s Config,
    matcher: M,
    searcher: &'s Searcher,
    sink: S,

    // Binary detection state
    binary: bool,
    binary_byte_offset: Option<usize>,

    // Position tracking - dual granularity
    pos: usize,                    // Position within current buffer
    absolute_byte_offset: u64,     // Position in the overall file

    // Line number optimization
    line_number: Option<u64>,      // Current line number (if tracking)
    last_line_counted: usize,      // Checkpoint for incremental counting
    last_line_visited: usize,      // Last line we processed

    // Context and match state
    after_context_left: usize,     // Remaining after-context lines to emit
    has_sunk: bool,                // Have we output anything?
    has_matched: bool,             // Have we found any match?
    count: u64,                    // Total matches found
}

The struct is generic over M: Matcher and S: Sink, allowing different regex engines and output destinations. The 's lifetime ties borrowed references to the searcher's lifetime.

impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
    pub(crate) fn new(
        searcher: &'s Searcher,
        matcher: M,
        sink: S,
        binary: bool,
    ) -> Core<'s, M, S> {
        let line_number =
            if searcher.config.line_number { Some(1) } else { None };
        let core = Core {
            config: &searcher.config,
            matcher,
            searcher,
            sink,
            binary,
            pos: 0,
            absolute_byte_offset: 0,
            binary_byte_offset: None,
            line_number,
            last_line_counted: 0,
            last_line_visited: 0,
            after_context_left: 0,
            has_sunk: false,
            has_matched: false,
            count: 0,
        };
        // Log which search path will be used
        if !core.searcher.multi_line_with_matcher(&core.matcher) {
            if core.is_line_by_line_fast() {
                log::trace!("searcher core: will use fast line searcher");
            } else {
                log::trace!("searcher core: will use slow line searcher");
            }
        }
        core
    }
}

Line number tracking is optional (Option<u64>) to avoid counting overhead when not needed.


Section 2: The Fast Path vs. Slow Path Decision

fn is_line_by_line_fast(&self) -> bool {
    debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));

    // Passthrough mode requires slow path (must see every line)
    if self.config.passthru {
        return false;
    }
    // Stop-on-nonmatch after a match needs slow path
    if self.config.stop_on_nonmatch && self.has_matched {
        return false;
    }

    // Check if matcher understands line terminators
    if let Some(line_term) = self.matcher.line_terminator() {
        // FIXME: Workaround for grep-regex bug with NUL terminators
        // Line anchors like (?m:^) and (?m:$) don't work with \x00
        if line_term.as_byte() == b'\x00' {
            return false;
        }
        // Matcher's line terminator matches our config
        if line_term == self.config.line_term {
            return true;
        }
    }

    // Alternative: check non_matching_bytes
    if let Some(non_matching) = self.matcher.non_matching_bytes() {
        // For CRLF, we only care about \n (the required part)
        if non_matching.contains(self.config.line_term.as_byte()) {
            return true;
        }
    }
    false
}

The method returns true only when the matcher can correctly handle line boundaries during its search, enabling the optimized fast path.

pub(crate) fn match_by_line(
    &mut self,
    buf: &[u8],
) -> Result<bool, S::Error> {
    if self.is_line_by_line_fast() {
        match self.match_by_line_fast(buf)? {
            FastMatchResult::SwitchToSlow => self.match_by_line_slow(buf),
            FastMatchResult::Continue => Ok(true),
            FastMatchResult::Stop => Ok(false),
        }
    } else {
        self.match_by_line_slow(buf)
    }
}

The FastMatchResult enum allows the fast path to hand off to the slow path mid-search when conditions change.


Section 3: The Fast Line Search Algorithm

#[inline(always)]
fn find_by_line_fast(
    &mut self,
    buf: &[u8],
) -> Result<Option<Range>, S::Error> {
    debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));
    debug_assert!(self.is_line_by_line_fast());

    let mut pos = self.pos();
    while !buf[pos..].is_empty() {
        if self.has_exceeded_match_limit() {
            return Ok(None);
        }
        // Ask matcher for candidate positions
        match self.matcher.find_candidate_line(&buf[pos..]) {
            Err(err) => return Err(S::Error::error_message(err)),
            Ok(None) => return Ok(None),  // No more candidates

            // Confirmed match - regex engine is certain
            Ok(Some(LineMatchKind::Confirmed(i))) => {
                let line = lines::locate(
                    buf,
                    self.config.line_term.as_byte(),
                    Range::zero(i).offset(pos),
                );
                // Edge case: match at buffer end isn't valid
                if line.start() == buf.len() {
                    pos = buf.len();
                    continue;
                }
                return Ok(Some(line));
            }

            // Candidate - needs verification
            Ok(Some(LineMatchKind::Candidate(i))) => {
                let line = lines::locate(
                    buf,
                    self.config.line_term.as_byte(),
                    Range::zero(i).offset(pos),
                );
                // Verify the candidate is a real match
                if self.is_match(&buf[line])? {
                    return Ok(Some(line));
                }
                // Not a match, advance past this line
                pos = line.end();
            }
        }
    }
    Ok(None)
}

The lines::locate function finds the full line containing a given position, handling the translation from match position to line boundaries.

fn is_match(&self, line: &[u8]) -> Result<bool, S::Error> {
    // Strip terminator to prevent regexes like (?m)^$ from
    // matching the empty position after the line terminator
    let line = lines::without_terminator(line, self.config.line_term);
    self.matcher.is_match(line).map_err(S::Error::error_message)
}

Section 4: Inverted Match Handling

#[inline(always)]
fn match_by_line_fast_invert(
    &mut self,
    buf: &[u8],
) -> Result<bool, S::Error> {
    assert!(self.config.invert_match);

    // Find the next matching line (which we want to exclude)
    let invert_match = match self.find_by_line_fast(buf)? {
        // No match found - everything from pos to end is non-matching
        None => {
            let range = Range::new(self.pos(), buf.len());
            self.set_pos(range.end());
            range
        }
        // Match found - non-matching region is from pos to match start
        Some(line) => {
            let range = Range::new(self.pos(), line.start());
            self.set_pos(line.end());  // Skip past the matching line
            range
        }
    };

    // Empty region means we hit a match immediately
    if invert_match.is_empty() {
        return Ok(true);
    }

    self.has_matched = true;

    // Handle context around the non-matching region
    if !self.after_context_by_line(buf, invert_match.start())? {
        return Ok(false);
    }
    if !self.before_context_by_line(buf, invert_match.start())? {
        return Ok(false);
    }

    // Iterate through each non-matching line
    let mut stepper = LineStep::new(
        self.config.line_term.as_byte(),
        invert_match.start(),
        invert_match.end(),
    );
    while let Some(line) = stepper.next_match(buf) {
        self.increment_count();  // Each line counts as a "match"
        if !self.sink_matched(buf, &line)? {
            return Ok(false);
        }
        // Check limit after each line
        if self.has_exceeded_match_limit() {
            return Ok(false);
        }
    }
    Ok(true)
}

The inverted match region boundaries are computed by finding where actual matches occur, then treating everything between as the result.


Section 5: The Slow Path for Complex Scenarios

fn match_by_line_slow(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
    debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));

    let range = Range::new(self.pos(), buf.len());
    let mut stepper = LineStep::new(
        self.config.line_term.as_byte(),
        range.start(),
        range.end(),
    );

    while let Some(line) = stepper.next_match(buf) {
        // Early exit if we've hit limits and have no more context to emit
        if self.has_exceeded_match_limit()
            && !self.config.passthru
            && self.after_context_left == 0
        {
            return Ok(false);
        }

        // Check for match (with line terminator stripped)
        let matched = {
            let slice = lines::without_terminator(
                &buf[line],
                self.config.line_term,
            );
            self.shortest_match(slice)?.is_some()
        };
        self.set_pos(line.end());

        // XOR with invert_match to get actual success
        let success = matched != self.config.invert_match;

        if success {
            // Case 1: Line matches (or doesn't match in invert mode)
            self.has_matched = true;
            self.increment_count();
            if !self.before_context_by_line(buf, line.start())? {
                return Ok(false);
            }
            if !self.sink_matched(buf, &line)? {
                return Ok(false);
            }
        } else if self.after_context_left >= 1 {
            // Case 2: Emitting after-context from previous match
            if !self.sink_after_context(buf, &line)? {
                return Ok(false);
            }
        } else if self.config.passthru {
            // Case 3: Passthrough mode - emit as "other" context
            if !self.sink_other_context(buf, &line)? {
                return Ok(false);
            }
        }

        // Case 4: Stop on first non-match after a match
        if self.config.stop_on_nonmatch && !success && self.has_matched {
            return Ok(false);
        }
    }
    Ok(true)
}

The shortest_match method is an optimization—we only need to know if a match exists, not its full extent.


Quick Reference

FastMatchResult Enum

Variant Meaning
Continue Keep processing, buffer consumed
Stop Stop searching entirely
SwitchToSlow Fall back to slow path

Key State Fields

Field Purpose
pos Current position in buffer
absolute_byte_offset Position in file (survives buffer rolls)
last_line_counted Checkpoint for incremental line counting
after_context_left Remaining context lines to emit
count Total matches (for --max-count)

LineMatchKind Variants

enum LineMatchKind {
    Confirmed(usize),  // Definite match at offset
    Candidate(usize),  // Possible match, needs verification
}

Fast Path Requirements

The fast path is enabled when: 1. passthru mode is disabled 2. stop_on_nonmatch hasn't triggered 3. Matcher advertises line terminator support OR non-matching bytes include the terminator 4. Line terminator is not NUL (\x00)