ripgrep crates/printer/src/json.rs: Code Companion¶
Reference code for the JSON Output lecture. Sections correspond to the lecture document.
Section 1: The Configuration and Builder Pattern¶
/// The configuration for the JSON printer.
///
/// A `JSONBuilder` mutates this value and hands a clone of it to every
/// printer it builds. Once a printer is built, its configuration is frozen
/// and cannot be changed.
#[derive(Debug, Clone)]
struct Config {
    /// Emit pretty-printed (multi-line) JSON instead of compact JSON.
    pretty: bool,
    /// Emit `begin`/`end` messages even when a search produces no matches.
    always_begin_end: bool,
    /// Bytes substituted for each match. `None` means no replacement is
    /// configured; the `Arc` keeps cloning the configuration cheap.
    replacement: Arc<Option<Vec<u8>>>,
}

impl Default for Config {
    /// Compact output, no forced `begin`/`end` messages, no replacement.
    fn default() -> Self {
        Self {
            replacement: Arc::new(None),
            always_begin_end: false,
            pretty: false,
        }
    }
}
/// A builder for a JSON lines printer.
///
/// Every setter returns `&mut Self` so calls can be chained. `build` borrows
/// the builder immutably, so one builder can produce any number of printers.
#[derive(Clone, Debug)]
pub struct JSONBuilder {
    /// The settings that each built printer receives a copy of.
    config: Config,
}

impl JSONBuilder {
    /// Return a new builder for configuring the JSON printer.
    ///
    /// The builder starts out with `Config::default()`.
    pub fn new() -> Self {
        Self { config: Config::default() }
    }

    /// Create a JSON printer that writes results to the given writer.
    ///
    /// The printer gets its own clone of the current configuration, so
    /// later changes to this builder do not affect printers already built.
    pub fn build<W: io::Write>(&self, wtr: W) -> JSON<W> {
        let config = self.config.clone();
        let wtr = CounterWriter::new(wtr);
        JSON { config, wtr, matches: Vec::new() }
    }

    /// Print JSON in a pretty printed format.
    pub fn pretty(&mut self, yes: bool) -> &mut Self {
        self.config.pretty = yes;
        self
    }

    /// When enabled, the `begin` and `end` messages are always emitted.
    pub fn always_begin_end(&mut self, yes: bool) -> &mut Self {
        self.config.always_begin_end = yes;
        self
    }

    /// Set the bytes that will be used to replace each occurrence of a match.
    ///
    /// Passing `None` clears any previously configured replacement.
    pub fn replacement(&mut self, replacement: Option<Vec<u8>>) -> &mut Self {
        let shared = Arc::new(replacement);
        self.config.replacement = shared;
        self
    }
}
The Arc<Option<Vec<u8>>> pattern allows distinguishing "no replacement configured" (None) from "replace with empty string" (Some(vec![])), while the Arc enables cheap cloning when building printers.
Section 2: The JSON Printer Structure¶
/// The JSON printer: a writer paired with the configuration it was built with.
#[derive(Clone, Debug)]
pub struct JSON<W> {
    /// Configuration cloned from the builder at construction time.
    config: Config,
    /// The wrapped writer; the wrapper counts the bytes written through it.
    wtr: CounterWriter<W>,
    /// Scratch buffer of match offsets, cleared before each reuse.
    matches: Vec<Match>,
}

impl<W: io::Write> JSON<W> {
    /// Return a JSON lines printer with a default configuration.
    ///
    /// This is shorthand for `JSONBuilder::new().build(wtr)`.
    pub fn new(wtr: W) -> Self {
        let builder = JSONBuilder::new();
        builder.build(wtr)
    }
}
impl<W> JSON<W> {
    /// Returns true if and only if this printer has written at least one byte.
    pub fn has_written(&self) -> bool {
        let written = self.wtr.total_count();
        written > 0
    }

    /// Return a mutable reference to the underlying writer.
    pub fn get_mut(&mut self) -> &mut W {
        let counter = &mut self.wtr;
        counter.get_mut()
    }

    /// Consume this printer and return back ownership of the underlying writer.
    pub fn into_inner(self) -> W {
        let counter = self.wtr;
        counter.into_inner()
    }
}
The matches vector is allocated once and reused across all searches, amortizing allocation costs. The CounterWriter wrapper tracks output volume for statistics reporting.
Section 3: The Sink Abstraction¶
/// An implementation of `Sink` associated with a matcher and an optional file path.
///
/// Values of this type are created by `JSON::sink` and `JSON::sink_with_path`
/// and hold a mutable borrow of the printer for as long as they exist.
///
/// Lifetime parameters:
/// * `'p` - lifetime of the file path (or `'static` if no path)
/// * `'s` - lifetime of the borrowed JSON printer
#[derive(Debug)]
pub struct JSONSink<'p, 's, M: Matcher, W> {
matcher: M, // Re-run over reported bytes to locate individual matches (see `record_matches`)
replacer: Replacer<M>, // Supplies the replacement bytes attached to each submatch, if any
json: &'s mut JSON<W>, // The parent printer; all messages are written through it
path: Option<&'p Path>, // File path copied into emitted messages; `None` when created via `sink`
start_time: Instant, // Reset in `begin`; presumably used for elapsed-time reporting - confirm in `finish`
match_count: u64, // Incremented once per `matched` callback; reset in `begin`
binary_byte_offset: Option<u64>, // Reset to `None` in `begin`; presumably the offset where binary data was detected
begin_printed: bool, // Set once the `begin` message has been written, so it is never repeated
stats: Stats, // Accumulates match and matched-line counts across callbacks
}
impl<W: io::Write> JSON<W> {
    /// Return an implementation of `Sink` for the JSON printer.
    ///
    /// The sink has no file path, so no path is attached to the messages
    /// it emits.
    pub fn sink<'s, M: Matcher>(
        &'s mut self,
        matcher: M,
    ) -> JSONSink<'static, 's, M, W> {
        self.sink_impl(matcher, None)
    }

    /// Return an implementation of `Sink` associated with a file path.
    ///
    /// `path` may be anything that can be viewed as a `Path`; `P: ?Sized`
    /// lets unsized types such as `Path` and `str` be passed by reference.
    pub fn sink_with_path<'p, 's, M, P>(
        &'s mut self,
        matcher: M,
        path: &'p P,
    ) -> JSONSink<'p, 's, M, W>
    where
        M: Matcher,
        P: ?Sized + AsRef<Path>,
    {
        self.sink_impl(matcher, Some(path.as_ref()))
    }

    /// Build a sink in its initial state for the given matcher and
    /// optional path. Shared by `sink` and `sink_with_path`.
    fn sink_impl<'p, 's, M: Matcher>(
        &'s mut self,
        matcher: M,
        path: Option<&'p Path>,
    ) -> JSONSink<'p, 's, M, W> {
        JSONSink {
            matcher,
            replacer: Replacer::new(),
            json: self,
            path,
            start_time: Instant::now(),
            match_count: 0,
            binary_byte_offset: None,
            begin_printed: false,
            stats: Stats::new(),
        }
    }
}
The ?Sized bound on P allows accepting both Path (unsized) and PathBuf (sized) through the AsRef<Path> trait. The sink borrows the printer mutably, ensuring exclusive access during a search.
Section 4: The Wire Format and Message Types¶
impl<W: io::Write> JSON<W> {
    /// Write the given message followed by a new line.
    ///
    /// The message is serialized in pretty-printed form when `config.pretty`
    /// is set and in compact form otherwise.
    ///
    /// # Errors
    ///
    /// Returns any error produced while serializing the message or while
    /// writing to the underlying writer, including a failure to write the
    /// terminating newline.
    fn write_message(
        &mut self,
        message: &jsont::Message<'_>,
    ) -> io::Result<()> {
        // Choose compact or pretty output based on configuration.
        if self.config.pretty {
            json::to_writer_pretty(&mut self.wtr, message)?;
        } else {
            json::to_writer(&mut self.wtr, message)?;
        }
        // Every message must end with a newline. `write_all` is used rather
        // than `write` because `write` may legally accept zero bytes, and
        // ignoring its returned count would silently drop the terminator.
        self.wtr.write_all(b"\n")
    }
}
impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> {
    /// Write the "begin" message unless it has already been written.
    ///
    /// `begin_printed` is only set after a successful write, so a failed
    /// attempt is retried on the next call.
    fn write_begin_message(&mut self) -> io::Result<()> {
        if !self.begin_printed {
            let begin = jsont::Begin { path: self.path };
            self.json.write_message(&jsont::Message::Begin(begin))?;
            self.begin_printed = true;
        }
        Ok(())
    }
}
// From the Sink trait implementation - the begin callback
fn begin(&mut self, _searcher: &Searcher) -> Result<bool, io::Error> {
    // Start this search from a clean slate: byte counter, clock and
    // per-search bookkeeping.
    self.json.wtr.reset_count();
    self.start_time = Instant::now();
    self.match_count = 0;
    self.binary_byte_offset = None;

    // The begin message is emitted eagerly only when always_begin_end is
    // configured; otherwise it is deferred until `write_begin_message` is
    // called from another callback.
    if self.json.config.always_begin_end {
        self.write_begin_message()?;
    }
    Ok(true)
}
The message types (Begin, End, Match, Context) are defined in the jsont module and implement Serialize for JSON output. The begin_printed flag prevents duplicate begin messages.
Section 5: Handling Non-UTF-8 Data¶
From the extensive documentation in the source file:
/// ## Text encoding
///
/// JSON may only be encoded in UTF-8, UTF-16 or UTF-32. For the purposes of this
/// printer, we need only worry about UTF-8. The problem here is that searching
/// is not limited to UTF-8 exclusively, which in turn implies that matches
/// may be reported that contain invalid UTF-8.
///
/// Therefore, this printer will emit valid UTF-8 encoded bytes as normal
/// JSON strings and otherwise base64 encode data that isn't valid UTF-8. To
/// communicate whether this process occurs or not, strings are keyed by the
/// name `text` whereas arbitrary bytes are keyed by `bytes`.
///
/// For example, when a path is included in a message, it is formatted like so,
/// if and only if the path is valid UTF-8:
///
/// ```json
/// {
/// "path": {
/// "text": "/home/ubuntu/lib.rs"
/// }
/// }
/// ```
///
/// If instead our path was `/home/ubuntu/lib\xFF.rs`, where the `\xFF` byte
/// makes it invalid UTF-8, the path would instead be encoded like so:
///
/// ```json
/// {
/// "path": {
/// "bytes": "L2hvbWUvdWJ1bnR1L2xpYv8ucnM="
/// }
/// }
/// ```
///
/// The printer guarantees that the `text` field is used whenever the
/// underlying bytes are valid UTF-8.
The dual representation (text vs bytes) ensures lossless data transfer: consumers can always recover original bytes either directly from the string or via base64 decoding.
Section 6: Recording Matches Within Lines¶
impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> {
    /// Run the matcher over `range` of `bytes` and store every match found
    /// in the printer's reusable `matches` buffer.
    ///
    /// Stored offsets are relative to `range.start`. Any previous contents
    /// of the buffer are discarded first.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `find_iter_at_in_context`.
    fn record_matches(
        &mut self,
        searcher: &Searcher,
        bytes: &[u8],
        range: std::ops::Range<usize>,
    ) -> io::Result<()> {
        let line_start = range.start;
        let found = &mut self.json.matches;
        found.clear(); // Reuse the buffer

        find_iter_at_in_context(
            searcher,
            &self.matcher,
            bytes,
            range,
            |m| {
                // Rebase offsets onto the start of the reported range.
                let rebased =
                    Match::new(m.start() - line_start, m.end() - line_start);
                found.push(rebased);
                true // Keep iterating over matches
            },
        )?;

        // Don't report an empty match appearing at the end of the bytes.
        // NOTE(review): the stored offsets are relative to `range.start`
        // while `bytes.len()` is the length of the whole buffer - confirm
        // this comparison is intended when `range` is a strict sub-range.
        let trailing_empty = found
            .last()
            .map_or(false, |last| last.is_empty() && last.start() >= bytes.len());
        if trailing_empty {
            found.pop();
        }
        Ok(())
    }
}
The offset adjustment (m.start() - range.start) normalizes match positions to be relative to the line being reported, not the entire search buffer.
Section 7: The Match and Context Sink Callbacks¶
impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> {
    type Error = io::Error;

    /// Handle one reported match: emit the `begin` message if it is still
    /// pending, record the submatches and replacement, update statistics
    /// and write a `match` message. Always asks the searcher to continue.
    fn matched(
        &mut self,
        searcher: &Searcher,
        mat: &SinkMatch<'_>,
    ) -> Result<bool, io::Error> {
        self.match_count += 1;
        self.write_begin_message()?;

        // Locate the individual matches, then compute the replacement
        // over the same range.
        let buffer = mat.buffer();
        let range = mat.bytes_range_in_buffer();
        self.record_matches(searcher, buffer, range.clone())?;
        self.replace(searcher, buffer, range)?;

        let submatch_count = self.json.matches.len() as u64;
        self.stats.add_matches(submatch_count);
        let line_count = mat.lines().count() as u64;
        self.stats.add_matched_lines(line_count);

        let lines = mat.bytes();
        let submatches = SubMatches::new(
            lines,
            &self.json.matches,
            self.replacer.replacement(),
        );
        let body = jsont::Match {
            path: self.path,
            lines,
            line_number: mat.line_number(),
            absolute_offset: mat.absolute_byte_offset(),
            submatches: submatches.as_slice(),
        };
        self.json.write_message(&jsont::Message::Match(body))?;
        Ok(true)
    }

    /// Handle one context line: emit the `begin` message if it is still
    /// pending and write a `context` message. Submatches are computed only
    /// when the search is inverted. Always asks the searcher to continue.
    fn context(
        &mut self,
        searcher: &Searcher,
        ctx: &SinkContext<'_>,
    ) -> Result<bool, io::Error> {
        self.write_begin_message()?;
        self.json.matches.clear();

        let lines = ctx.bytes();
        let submatches = if !searcher.invert_match() {
            SubMatches::empty()
        } else {
            // With an inverted search, context lines are the ones that
            // contain matches, so search the whole line.
            let whole = 0..lines.len();
            self.record_matches(searcher, lines, whole.clone())?;
            self.replace(searcher, lines, whole)?;
            SubMatches::new(
                lines,
                &self.json.matches,
                self.replacer.replacement(),
            )
        };
        let body = jsont::Context {
            path: self.path,
            lines,
            line_number: ctx.line_number(),
            absolute_offset: ctx.absolute_byte_offset(),
            submatches: submatches.as_slice(),
        };
        self.json.write_message(&jsont::Message::Context(body))?;
        Ok(true)
    }
}
The matched callback handles the common case; context handles contextual lines which may have matches when using inverted search (-v flag).
Section 8: The SubMatches Small-Vector Optimization¶
/// SubMatches represents a set of matches in a contiguous range of bytes.
///
/// A single match is stored inline in a one-element array; any other
/// count built by `new` is stored in a `Vec`.
enum SubMatches<'a> {
    /// No submatches at all.
    Empty,
    /// Exactly one submatch, held in a fixed-size array.
    Small([jsont::SubMatch<'a>; 1]),
    /// Any other number of submatches, held in a vector.
    Big(Vec<jsont::SubMatch<'a>>),
}

impl<'a> SubMatches<'a> {
    /// Create a new set of match ranges from matches and corresponding bytes.
    ///
    /// When `replacement` is given, its match list is indexed in parallel
    /// with `matches` to pick the replacement bytes for each submatch.
    fn new(
        bytes: &'a [u8],
        matches: &[Match],
        replacement: Option<(&'a [u8], &'a [Match])>,
    ) -> SubMatches<'a> {
        // Build the submatch for the `i`-th match.
        let submatch_at = |i: usize, mat: Match| jsont::SubMatch {
            m: &bytes[mat],
            replacement: replacement
                .map(|(rbuf, rmatches)| &rbuf[rmatches[i]]),
            start: mat.start(),
            end: mat.end(),
        };
        match matches {
            // Exactly one match: store it inline.
            [only] => SubMatches::Small([submatch_at(0, *only)]),
            // Zero or several matches: collect into a vector.
            _ => SubMatches::Big(
                matches
                    .iter()
                    .enumerate()
                    .map(|(i, &mat)| submatch_at(i, mat))
                    .collect(),
            ),
        }
    }

    /// Create an empty set of match ranges.
    fn empty() -> SubMatches<'static> {
        SubMatches::Empty
    }

    /// Return this set of match ranges as a slice.
    fn as_slice(&self) -> &[jsont::SubMatch<'_>] {
        match self {
            SubMatches::Empty => &[],
            SubMatches::Small(one) => one,
            SubMatches::Big(many) => many,
        }
    }
}
This enum-based small-vector optimization avoids heap allocation for the most common case (one match per line), while gracefully handling multiple matches when necessary.
Quick Reference¶
Message Types¶
| Type | Purpose | Key Fields |
|---|---|---|
| begin | Search started | path |
| end | Search finished | path, binary_offset, stats |
| match | Match found | path, lines, line_number, absolute_offset, submatches |
| context | Context line | Same as match |
Arbitrary Data Object¶
Builder Configuration¶
| Method | Default | Purpose |
|---|---|---|
| pretty(bool) | false | Multi-line JSON output |
| always_begin_end(bool) | false | Emit begin/end with no matches |
| replacement(Option<Vec<u8>>) | None | Text substitution pattern |
Key Type Signatures¶
// Create printer
pub fn new(wtr: W) -> JSON<W>
// Create sink for searching
pub fn sink<M: Matcher>(&mut self, matcher: M) -> JSONSink<'static, '_, M, W>
pub fn sink_with_path<M, P>(&mut self, matcher: M, path: &P) -> JSONSink<'_, '_, M, W>
// Sink trait implementation
impl<M: Matcher, W: io::Write> Sink for JSONSink<'_, '_, M, W> {
type Error = io::Error;
fn matched(&mut self, ...) -> Result<bool, io::Error>
fn context(&mut self, ...) -> Result<bool, io::Error>
}