Skip to content

ripgrep crates/printer/src/util.rs: Code Companion

Reference code for the Printer Utilities lecture. Sections correspond to the lecture document.


Section 1: The Replacer Pattern for Amortized Allocation

/// A type for handling replacements while amortizing allocation.
pub(crate) struct Replacer<M: Matcher> {
    // Option allows lazy initialization - None until first use
    space: Option<Space<M>>,
}

struct Space<M: Matcher> {
    /// The place to store capture locations.
    caps: M::Captures,  // Generic over matcher's capture type
    /// The place to write a replacement to.
    dst: Vec<u8>,
    /// The place to store match offsets in terms of `dst`.
    matches: Vec<Match>,
}

impl<M: Matcher> Replacer<M> {
    /// Create a new replacer for use with a particular matcher.
    ///
    /// This constructor does not allocate. Instead, space for dealing with
    /// replacements is allocated lazily only when needed.
    pub(crate) fn new() -> Replacer<M> {
        Replacer { space: None }  // Zero allocation at construction
    }

    /// Allocate space for replacements when used with the given matcher and
    /// return a mutable reference to that space.
    fn allocate(&mut self, matcher: &M) -> io::Result<&mut Space<M>> {
        if self.space.is_none() {
            // First allocation happens here - captures come from matcher
            let caps =
                matcher.new_captures().map_err(io::Error::error_message)?;
            self.space = Some(Space { caps, dst: vec![], matches: vec![] });
        }
        Ok(self.space.as_mut().unwrap())
    }

    /// Clear space used for performing a replacement.
    pub(crate) fn clear(&mut self) {
        if let Some(ref mut space) = self.space {
            // Clear contents but retain allocated capacity
            space.dst.clear();
            space.matches.clear();
        }
    }
}

The M::Captures associated type allows each matcher implementation to define its own capture storage format, maintaining type safety without runtime dispatch.


Section 2: Multi-Line Mode and the Look-Ahead Problem

pub(crate) fn replace_all<'a>(
    &'a mut self,
    searcher: &Searcher,
    matcher: &M,
    mut haystack: &[u8],
    range: std::ops::Range<usize>,
    replacement: &[u8],
) -> io::Result<()> {
    // Detect if we're in multi-line mode to handle look-ahead correctly
    let is_multi_line = searcher.multi_line_with_matcher(&matcher);

    // Get the line_terminator that was removed (if any) so we can add it back
    let line_terminator = if is_multi_line {
        // In multi-line mode: extend buffer for look-ahead
        if haystack[range.end..].len() >= MAX_LOOK_AHEAD {
            haystack = &haystack[..range.end + MAX_LOOK_AHEAD];
        }
        &[]  // No terminator removed
    } else {
        // In single-line mode: remove terminator to prevent false negatives
        let mut m = Match::new(0, range.end);
        let line_terminator =
            trim_line_terminator(searcher, haystack, &mut m);
        haystack = &haystack[..m.end()];
        line_terminator  // Save to restore later
    };

    // ... replacement logic, then restore line_terminator at the end
}
/// Given a buf and some bounds, if there is a line terminator at the end of
/// the given bounds in buf, then the bounds are trimmed to remove the line
/// terminator, returning the slice of the removed line terminator (if any).
pub(crate) fn trim_line_terminator<'b>(
    searcher: &Searcher,
    buf: &'b [u8],
    line: &mut Match,
) -> &'b [u8] {
    let lineterm = searcher.line_terminator();
    if lineterm.is_suffix(&buf[*line]) {
        let mut end = line.end() - 1;
        // Handle CRLF as a two-byte terminator
        if lineterm.is_crlf() && end > 0 && buf.get(end - 1) == Some(&b'\r') {
            end -= 1;
        }
        let orig_end = line.end();
        *line = line.with_end(end);
        &buf[end..orig_end]  // Return the trimmed terminator
    } else {
        &[]
    }
}

The MAX_LOOK_AHEAD constant (defined elsewhere) caps how much extra context is provided, preventing unbounded buffer growth while still supporting most practical look-ahead patterns.


Section 3: The Sunk Abstraction for Unified Match Handling

/// A simple layer of abstraction over either a match or a contextual line
/// reported by the searcher.
#[derive(Debug)]
pub(crate) struct Sunk<'a> {
    bytes: &'a [u8],
    absolute_byte_offset: u64,
    line_number: Option<u64>,
    context_kind: Option<&'a SinkContextKind>,  // None for matches, Some for context
    matches: &'a [Match],           // Positions in (possibly replaced) output
    original_matches: &'a [Match],  // Positions in original text
}

impl<'a> Sunk<'a> {
    #[inline]
    pub(crate) fn from_sink_match(
        sunk: &'a SinkMatch<'a>,
        original_matches: &'a [Match],
        replacement: Option<(&'a [u8], &'a [Match])>,
    ) -> Sunk<'a> {
        // Use replacement data if present, otherwise original
        let (bytes, matches) =
            replacement.unwrap_or_else(|| (sunk.bytes(), original_matches));
        Sunk {
            bytes,
            absolute_byte_offset: sunk.absolute_byte_offset(),
            line_number: sunk.line_number(),
            context_kind: None,  // This is a match, not context
            matches,
            original_matches,
        }
    }

    #[inline]
    pub(crate) fn from_sink_context(
        sunk: &'a SinkContext<'a>,
        original_matches: &'a [Match],
        replacement: Option<(&'a [u8], &'a [Match])>,
    ) -> Sunk<'a> {
        let (bytes, matches) =
            replacement.unwrap_or_else(|| (sunk.bytes(), original_matches));
        Sunk {
            bytes,
            absolute_byte_offset: sunk.absolute_byte_offset(),
            line_number: sunk.line_number(),
            context_kind: Some(sunk.kind()),  // Before, After, or Passthrough
            matches,
            original_matches,
        }
    }

    /// Iterate over lines using the given terminator byte
    #[inline]
    pub(crate) fn lines(&self, line_term: u8) -> LineIter<'a> {
        LineIter::new(line_term, self.bytes())
    }
}

The 'a lifetime ties all borrowed data to the same scope, ensuring the Sunk value cannot outlive the search results it references.


Section 4: Cross-Platform Path Handling

#[derive(Clone, Debug)]
pub(crate) struct PrinterPath<'a> {
    // Only stored on Windows - Unix can reconstruct from bytes
    #[cfg(not(unix))]
    path: &'a Path,
    bytes: Cow<'a, [u8]>,  // Borrowed when possible, owned after transforms
    hyperlink: OnceCell<Option<HyperlinkPath>>,  // Lazy, computed once
}

impl<'a> PrinterPath<'a> {
    pub(crate) fn new(path: &'a Path) -> PrinterPath<'a> {
        PrinterPath {
            #[cfg(not(unix))]
            path,
            // Zero-cost on Unix, UTF-8 check on Windows
            bytes: Vec::from_path_lossy(path),
            hyperlink: OnceCell::new(),
        }
    }

    pub(crate) fn with_separator(mut self, sep: Option<u8>) -> PrinterPath<'a> {
        /// Replace path separators in place
        fn replace_separator(bytes: &[u8], sep: u8) -> Vec<u8> {
            let mut bytes = bytes.to_vec();
            for b in bytes.iter_mut() {
                // Windows uses both / and \, Unix only /
                if *b == b'/' || (cfg!(windows) && *b == b'\\') {
                    *b = sep;
                }
            }
            bytes
        }
        let Some(sep) = sep else { return self };
        // Cow becomes Owned after transformation
        self.bytes = Cow::Owned(replace_separator(self.as_bytes(), sep));
        self
    }

    /// Return this path as a hyperlink (computed lazily)
    pub(crate) fn as_hyperlink(&self) -> Option<&HyperlinkPath> {
        self.hyperlink
            .get_or_init(|| HyperlinkPath::from_path(self.as_path()))
            .as_ref()
    }

    /// Platform-specific path reconstruction
    pub(crate) fn as_path(&self) -> &Path {
        #[cfg(unix)]
        fn imp<'p>(p: &'p PrinterPath<'_>) -> &'p Path {
            use std::{ffi::OsStr, os::unix::ffi::OsStrExt};
            // Zero-cost reconstruction from bytes on Unix
            Path::new(OsStr::from_bytes(p.as_bytes()))
        }
        #[cfg(not(unix))]
        fn imp<'p>(p: &'p PrinterPath<'_>) -> &'p Path {
            p.path  // Use stored reference on Windows
        }
        imp(self)
    }
}

The Cow<'a, [u8]> type is ideal here: it borrows when no transformation is needed and owns when the separator is replaced, avoiding unnecessary allocation in the common case.


Section 5: Performance-Optimized Number Formatting

/// A simple formatter for converting `u64` values to ASCII byte strings.
///
/// This avoids going through the formatting machinery which seems to
/// substantially slow things down.
#[derive(Debug)]
pub(crate) struct DecimalFormatter {
    buf: [u8; Self::MAX_U64_LEN],  // Fixed-size array on stack
    start: usize,                   // Index where digits begin
}

impl DecimalFormatter {
    /// Discovered via `u64::MAX.to_string().len()`.
    const MAX_U64_LEN: usize = 20;

    pub(crate) fn new(mut n: u64) -> DecimalFormatter {
        let mut buf = [0; Self::MAX_U64_LEN];
        let mut i = buf.len();  // Start from the end
        loop {
            i -= 1;
            // Extract least significant digit
            let digit = u8::try_from(n % 10).unwrap();
            n /= 10;
            buf[i] = b'0' + digit;  // Convert to ASCII
            if n == 0 {
                break;
            }
        }
        DecimalFormatter { buf, start: i }
    }

    /// Return the decimal formatted as an ASCII byte string.
    pub(crate) fn as_bytes(&self) -> &[u8] {
        &self.buf[self.start..]  // Slice from first digit to end
    }
}

#[cfg(test)]
mod tests {
    #[test]
    fn custom_decimal_format() {
        let fmt = |n: u64| {
            let bytes = DecimalFormatter::new(n).as_bytes().to_vec();
            String::from_utf8(bytes).unwrap()
        };
        let std = |n: u64| n.to_string();

        let ints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 100, 123, u64::MAX];
        for n in ints {
            assert_eq!(std(n), fmt(n));  // Verify against std lib
        }
    }
}

Writing digits from right-to-left avoids the need to reverse the buffer afterward. The start index tracks where valid digits begin in the fixed-size buffer.


Quick Reference

Key Types

Type Purpose
Replacer<M> Amortized-allocation regex replacement
Sunk<'a> Unified match/context abstraction
PrinterPath<'a> Cross-platform path with lazy hyperlinks
DecimalFormatter Zero-allocation number formatting
NiceDuration Human-readable duration display

Conditional Compilation Patterns

// Store extra data only on non-Unix platforms
#[cfg(not(unix))]
path: &'a Path,

// Platform-specific implementations
#[cfg(unix)]
fn imp(...) { /* Unix version */ }
#[cfg(not(unix))]
fn imp(...) { /* Windows version */ }

// Conditional separator handling
if *b == b'/' || (cfg!(windows) && *b == b'\\') { ... }

Memory Optimization Patterns

// Lazy initialization with Option
space: Option<Space<M>>

// Lazy computation with OnceCell
hyperlink: OnceCell<Option<HyperlinkPath>>

// Copy-on-write with Cow
bytes: Cow<'a, [u8]>

// Stack-allocated fixed buffer
buf: [u8; Self::MAX_U64_LEN]

Important Constants

const MAX_LOOK_AHEAD: usize;     // Defined in crate root, limits regex look-ahead
const MAX_U64_LEN: usize = 20;   // Maximum digits in u64::MAX