ripgrep crates/printer/src/util.rs: Code Companion¶
Reference code for the Printer Utilities lecture. Sections correspond to the lecture document.
Section 1: The Replacer Pattern for Amortized Allocation¶
/// A type for handling replacements while amortizing allocation.
pub(crate) struct Replacer<M: Matcher> {
// Option allows lazy initialization - None until first use
space: Option<Space<M>>,
}
struct Space<M: Matcher> {
/// The place to store capture locations.
caps: M::Captures, // Generic over matcher's capture type
/// The place to write a replacement to.
dst: Vec<u8>,
/// The place to store match offsets in terms of `dst`.
matches: Vec<Match>,
}
impl<M: Matcher> Replacer<M> {
/// Create a new replacer for use with a particular matcher.
///
/// This constructor does not allocate. Instead, space for dealing with
/// replacements is allocated lazily only when needed.
pub(crate) fn new() -> Replacer<M> {
Replacer { space: None } // Zero allocation at construction
}
/// Allocate space for replacements when used with the given matcher and
/// return a mutable reference to that space.
fn allocate(&mut self, matcher: &M) -> io::Result<&mut Space<M>> {
if self.space.is_none() {
// First allocation happens here - captures come from matcher
let caps =
matcher.new_captures().map_err(io::Error::error_message)?;
self.space = Some(Space { caps, dst: vec![], matches: vec![] });
}
Ok(self.space.as_mut().unwrap())
}
/// Clear space used for performing a replacement.
pub(crate) fn clear(&mut self) {
if let Some(ref mut space) = self.space {
// Clear contents but retain allocated capacity
space.dst.clear();
space.matches.clear();
}
}
}
The M::Captures associated type allows each matcher implementation to define its own capture storage format, maintaining type safety without runtime dispatch.
Section 2: Multi-Line Mode and the Look-Ahead Problem¶
pub(crate) fn replace_all<'a>(
&'a mut self,
searcher: &Searcher,
matcher: &M,
mut haystack: &[u8],
range: std::ops::Range<usize>,
replacement: &[u8],
) -> io::Result<()> {
// Detect if we're in multi-line mode to handle look-ahead correctly
let is_multi_line = searcher.multi_line_with_matcher(&matcher);
// Get the line_terminator that was removed (if any) so we can add it back
let line_terminator = if is_multi_line {
// In multi-line mode: extend buffer for look-ahead
if haystack[range.end..].len() >= MAX_LOOK_AHEAD {
haystack = &haystack[..range.end + MAX_LOOK_AHEAD];
}
&[] // No terminator removed
} else {
// In single-line mode: remove terminator to prevent false negatives
let mut m = Match::new(0, range.end);
let line_terminator =
trim_line_terminator(searcher, haystack, &mut m);
haystack = &haystack[..m.end()];
line_terminator // Save to restore later
};
// ... replacement logic, then restore line_terminator at the end
}
/// Given a buf and some bounds, if there is a line terminator at the end of
/// the given bounds in buf, then the bounds are trimmed to remove the line
/// terminator, returning the slice of the removed line terminator (if any).
pub(crate) fn trim_line_terminator<'b>(
searcher: &Searcher,
buf: &'b [u8],
line: &mut Match,
) -> &'b [u8] {
let lineterm = searcher.line_terminator();
if lineterm.is_suffix(&buf[*line]) {
let mut end = line.end() - 1;
// Handle CRLF as a two-byte terminator
if lineterm.is_crlf() && end > 0 && buf.get(end - 1) == Some(&b'\r') {
end -= 1;
}
let orig_end = line.end();
*line = line.with_end(end);
&buf[end..orig_end] // Return the trimmed terminator
} else {
&[]
}
}
The MAX_LOOK_AHEAD constant (defined elsewhere) caps how much extra context is provided, preventing unbounded buffer growth while still supporting most practical look-ahead patterns.
Section 3: The Sunk Abstraction for Unified Match Handling¶
/// A simple layer of abstraction over either a match or a contextual line
/// reported by the searcher.
#[derive(Debug)]
pub(crate) struct Sunk<'a> {
bytes: &'a [u8],
absolute_byte_offset: u64,
line_number: Option<u64>,
context_kind: Option<&'a SinkContextKind>, // None for matches, Some for context
matches: &'a [Match], // Positions in (possibly replaced) output
original_matches: &'a [Match], // Positions in original text
}
impl<'a> Sunk<'a> {
#[inline]
pub(crate) fn from_sink_match(
sunk: &'a SinkMatch<'a>,
original_matches: &'a [Match],
replacement: Option<(&'a [u8], &'a [Match])>,
) -> Sunk<'a> {
// Use replacement data if present, otherwise original
let (bytes, matches) =
replacement.unwrap_or_else(|| (sunk.bytes(), original_matches));
Sunk {
bytes,
absolute_byte_offset: sunk.absolute_byte_offset(),
line_number: sunk.line_number(),
context_kind: None, // This is a match, not context
matches,
original_matches,
}
}
#[inline]
pub(crate) fn from_sink_context(
sunk: &'a SinkContext<'a>,
original_matches: &'a [Match],
replacement: Option<(&'a [u8], &'a [Match])>,
) -> Sunk<'a> {
let (bytes, matches) =
replacement.unwrap_or_else(|| (sunk.bytes(), original_matches));
Sunk {
bytes,
absolute_byte_offset: sunk.absolute_byte_offset(),
line_number: sunk.line_number(),
context_kind: Some(sunk.kind()), // Before, After, or Passthrough
matches,
original_matches,
}
}
/// Iterate over lines using the given terminator byte
#[inline]
pub(crate) fn lines(&self, line_term: u8) -> LineIter<'a> {
LineIter::new(line_term, self.bytes())
}
}
The 'a lifetime ties all borrowed data to the same scope, ensuring the Sunk value cannot outlive the search results it references.
Section 4: Cross-Platform Path Handling¶
#[derive(Clone, Debug)]
pub(crate) struct PrinterPath<'a> {
// Only stored on Windows - Unix can reconstruct from bytes
#[cfg(not(unix))]
path: &'a Path,
bytes: Cow<'a, [u8]>, // Borrowed when possible, owned after transforms
hyperlink: OnceCell<Option<HyperlinkPath>>, // Lazy, computed once
}
impl<'a> PrinterPath<'a> {
pub(crate) fn new(path: &'a Path) -> PrinterPath<'a> {
PrinterPath {
#[cfg(not(unix))]
path,
// Zero-cost on Unix, UTF-8 check on Windows
bytes: Vec::from_path_lossy(path),
hyperlink: OnceCell::new(),
}
}
pub(crate) fn with_separator(mut self, sep: Option<u8>) -> PrinterPath<'a> {
/// Replace path separators in place
fn replace_separator(bytes: &[u8], sep: u8) -> Vec<u8> {
let mut bytes = bytes.to_vec();
for b in bytes.iter_mut() {
// Windows uses both / and \, Unix only /
if *b == b'/' || (cfg!(windows) && *b == b'\\') {
*b = sep;
}
}
bytes
}
let Some(sep) = sep else { return self };
// Cow becomes Owned after transformation
self.bytes = Cow::Owned(replace_separator(self.as_bytes(), sep));
self
}
/// Return this path as a hyperlink (computed lazily)
pub(crate) fn as_hyperlink(&self) -> Option<&HyperlinkPath> {
self.hyperlink
.get_or_init(|| HyperlinkPath::from_path(self.as_path()))
.as_ref()
}
/// Platform-specific path reconstruction
pub(crate) fn as_path(&self) -> &Path {
#[cfg(unix)]
fn imp<'p>(p: &'p PrinterPath<'_>) -> &'p Path {
use std::{ffi::OsStr, os::unix::ffi::OsStrExt};
// Zero-cost reconstruction from bytes on Unix
Path::new(OsStr::from_bytes(p.as_bytes()))
}
#[cfg(not(unix))]
fn imp<'p>(p: &'p PrinterPath<'_>) -> &'p Path {
p.path // Use stored reference on Windows
}
imp(self)
}
}
The Cow<'a, [u8]> type is ideal here: it borrows when no transformation is needed and owns when the separator is replaced, avoiding unnecessary allocation in the common case.
Section 5: Performance-Optimized Number Formatting¶
/// A simple formatter for converting `u64` values to ASCII byte strings.
///
/// This avoids going through the formatting machinery which seems to
/// substantially slow things down.
#[derive(Debug)]
pub(crate) struct DecimalFormatter {
buf: [u8; Self::MAX_U64_LEN], // Fixed-size array on stack
start: usize, // Index where digits begin
}
impl DecimalFormatter {
/// Discovered via `u64::MAX.to_string().len()`.
const MAX_U64_LEN: usize = 20;
pub(crate) fn new(mut n: u64) -> DecimalFormatter {
let mut buf = [0; Self::MAX_U64_LEN];
let mut i = buf.len(); // Start from the end
loop {
i -= 1;
// Extract least significant digit
let digit = u8::try_from(n % 10).unwrap();
n /= 10;
buf[i] = b'0' + digit; // Convert to ASCII
if n == 0 {
break;
}
}
DecimalFormatter { buf, start: i }
}
/// Return the decimal formatted as an ASCII byte string.
pub(crate) fn as_bytes(&self) -> &[u8] {
&self.buf[self.start..] // Slice from first digit to end
}
}
#[cfg(test)]
mod tests {
#[test]
fn custom_decimal_format() {
let fmt = |n: u64| {
let bytes = DecimalFormatter::new(n).as_bytes().to_vec();
String::from_utf8(bytes).unwrap()
};
let std = |n: u64| n.to_string();
let ints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 100, 123, u64::MAX];
for n in ints {
assert_eq!(std(n), fmt(n)); // Verify against std lib
}
}
}
Writing digits from right-to-left avoids the need to reverse the buffer afterward. The start index tracks where valid digits begin in the fixed-size buffer.
Quick Reference¶
Key Types¶
| Type | Purpose |
|---|---|
Replacer<M> |
Amortized-allocation regex replacement |
Sunk<'a> |
Unified match/context abstraction |
PrinterPath<'a> |
Cross-platform path with lazy hyperlinks |
DecimalFormatter |
Zero-allocation number formatting |
NiceDuration |
Human-readable duration display |
Conditional Compilation Patterns¶
// Store extra data only on non-Unix platforms
#[cfg(not(unix))]
path: &'a Path,
// Platform-specific implementations
#[cfg(unix)]
fn imp(...) { /* Unix version */ }
#[cfg(not(unix))]
fn imp(...) { /* Windows version */ }
// Conditional separator handling
if *b == b'/' || (cfg!(windows) && *b == b'\\') { ... }
Memory Optimization Patterns¶
// Lazy initialization with Option
space: Option<Space<M>>
// Lazy computation with OnceCell
hyperlink: OnceCell<Option<HyperlinkPath>>
// Copy-on-write with Cow
bytes: Cow<'a, [u8]>
// Stack-allocated fixed buffer
buf: [u8; Self::MAX_U64_LEN]