ripgrep crates/regex/src/literal.rs: Code Companion
Reference code for the Literal Optimization lecture. Sections correspond to the lecture document.
Section 1: The Inner Literal Strategy
/// A type that encapsulates "inner" literal extraction from a regex.
///
/// The main idea underlying the validity of this technique is the fact
/// that ripgrep searches individual lines and not across lines.
#[derive(Clone, Debug)]
pub(crate) struct InnerLiterals {
seq: Seq, // The extracted literal sequence
}
impl InnerLiterals {
pub(crate) fn new(chir: &ConfiguredHIR, re: &Regex) -> InnerLiterals {
// Inner literal optimization requires line boundaries to work
if chir.config().line_terminator.is_none() {
log::trace!(
"skipping inner literal extraction, \
no line terminator is set"
);
return InnerLiterals::none();
}
// Defer to regex engine's optimizations unless Unicode word
// boundaries might cause slowdowns
if re.is_accelerated() {
if !chir.hir().properties().look_set().contains_word_unicode() {
log::trace!(
"skipping inner literal extraction, \
existing regex is believed to already be accelerated",
);
return InnerLiterals::none();
}
}
// Pure alternation of literals: regex engine handles this optimally
if chir.hir().properties().is_alternation_literal() {
log::trace!(
"skipping inner literal extraction, \
found alternation of literals, deferring to regex engine",
);
return InnerLiterals::none();
}
let seq = Extractor::new().extract_untagged(chir.hir());
InnerLiterals { seq }
}
/// Returns an infinite set - signals no useful literals found
pub(crate) fn none() -> InnerLiterals {
InnerLiterals { seq: Seq::infinite() }
}
}
The is_accelerated() check asks whether the compiled regex already has effective internal optimizations of its own (for example, a literal prefilter); if so, inner literal extraction normally has nothing to add. The exception is contains_word_unicode(), which detects \b in Unicode mode: such patterns can force the regex engine onto much slower paths, so extraction proceeds even when the regex reports itself as accelerated.
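To see what the HIR-side checks look at, here is a small standalone sketch built on the public regex-syntax API (the patterns are invented for illustration; the is_accelerated() check lives on the compiled regex itself and is not reproduced here):

```rust
use regex_syntax::Parser;

fn main() {
    for pattern in [r"\w+Frobnicate\w+", r"foo|bar|baz", r"\bfoo\b"] {
        // Parse to the same HIR that ConfiguredHIR wraps, then inspect the
        // two properties the constructor consults.
        let hir = Parser::new().parse(pattern).unwrap();
        let props = hir.properties();
        println!(
            "{pattern}: alternation of literals = {}, Unicode \\b = {}",
            props.is_alternation_literal(),
            props.look_set().contains_word_unicode(),
        );
    }
}
```

The second pattern reports an alternation of literals, so extraction is skipped and the regex engine's own literal handling takes over; the third reports a Unicode \b, which is exactly what keeps extraction alive even for an otherwise accelerated regex.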
Section 2: The Extractor Architecture
/// An inner literal extractor with configurable limits.
#[derive(Debug)]
struct Extractor {
limit_class: usize, // Max chars to expand from character class
limit_repeat: usize, // Max repetition unrolling iterations
limit_literal_len: usize, // Max length of any single literal
limit_total: usize, // Max total literals in final sequence
}
impl Extractor {
fn new() -> Extractor {
Extractor {
limit_class: 10,
limit_repeat: 10,
limit_literal_len: 100,
limit_total: 64,
}
}
/// Main entry point - extracts and post-processes literals
fn extract_untagged(&self, hir: &Hir) -> Seq {
let mut seq = self.extract(hir);
log::trace!("extracted inner literals: {:?}", seq.seq);
// Optimization pass for prefix-based searching
seq.seq.optimize_for_prefix_by_preference();
log::trace!(
"extracted inner literals after optimization: {:?}",
seq.seq
);
// Heuristic check: are these literals actually useful?
if !seq.is_good() {
log::trace!(
"throwing away inner literals because they might be slow"
);
seq.make_infinite();
}
seq.seq
}
/// Recursive extraction dispatches based on HIR node type
fn extract(&self, hir: &Hir) -> TSeq {
use regex_syntax::hir::HirKind::*;
match *hir.kind() {
Empty | Look(_) => TSeq::singleton(Literal::exact(vec![])),
Literal(hir::Literal(ref bytes)) => {
let mut seq =
TSeq::singleton(Literal::exact(bytes.to_vec()));
self.enforce_literal_len(&mut seq);
seq
}
Class(hir::Class::Unicode(ref cls)) => {
self.extract_class_unicode(cls)
}
Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls),
Repetition(ref rep) => self.extract_repetition(rep),
Capture(hir::Capture { ref sub, .. }) => self.extract(sub),
Concat(ref hirs) => self.extract_concat(hirs.iter()),
Alternation(ref hirs) => self.extract_alternation(hirs.iter()),
}
}
}
The TSeq wrapper ("tagged sequence") tracks whether the sequence represents a prefix position in the regex. This metadata is stripped away in extract_untagged before returning the final Seq.
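The exact shape of the wrapper is an implementation detail of literal.rs; a minimal sketch consistent with how it is used in this document (the field and method bodies here are assumptions) looks like this:

```rust
use regex_syntax::hir::literal::{Literal, Seq};

/// Sketch of the tagged wrapper: a plain `Seq` plus a flag recording whether
/// the literals still sit at a prefix position of the regex.
#[derive(Clone, Debug)]
struct TSeq {
    seq: Seq,
    prefix: bool,
}

impl TSeq {
    /// A sequence containing exactly one literal, initially a prefix.
    fn singleton(lit: Literal) -> TSeq {
        TSeq { seq: Seq::singleton(lit), prefix: true }
    }

    /// Mark the literals as no longer anchored to the pattern start.
    fn make_not_prefix(&mut self) {
        self.prefix = false;
    }
}
```

Section 3 flips this flag whenever extraction restarts after an inexact gap, since literals gathered from that point on no longer describe a prefix of the match.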
Section 3: Handling Concatenation
/// Extract from concatenation via cross product of sub-expressions
fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> TSeq {
let mut seq = TSeq::singleton(Literal::exact(vec![]));
let mut prev: Option<TSeq> = None; // Tracks best candidate so far
for hir in it {
// Once all literals are inexact, cross product is a no-op
if seq.is_inexact() {
// Empty sequence = impossible match, bail early
if seq.is_empty() {
return seq;
}
// Found something really good? Stop looking
if seq.is_really_good() {
return seq;
}
// Save current best, start fresh for next segment
prev = Some(match prev {
None => seq,
Some(prev) => prev.choose(seq), // Pick better of two
});
seq = TSeq::singleton(Literal::exact(vec![]));
seq.make_not_prefix(); // No longer at pattern start
}
// Cross product combines current seq with next sub-expression
seq = self.cross(seq, self.extract(hir));
}
// Return best of saved candidate and final sequence
if let Some(prev) = prev { prev.choose(seq) } else { seq }
}
The prev variable implements the "keep the best so far" strategy. When the current sequence becomes inexact (hitting a gap like [a-z]+), we save it and start fresh, later choosing whichever is better.
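The following toy, self-contained sketch shows the same "keep the best" decision in isolation. The scoring rule used here (longer minimum literal wins, ties broken by fewer alternatives) is an assumption chosen for illustration, not ripgrep's exact choose heuristic:

```rust
/// Toy stand-in for the `prev.choose(seq)` step: pick whichever candidate
/// literal set makes the better search needle.
fn choose<'a>(a: Vec<&'a str>, b: Vec<&'a str>) -> Vec<&'a str> {
    fn score(lits: &[&str]) -> (usize, std::cmp::Reverse<usize>) {
        let min_len = lits.iter().map(|l| l.len()).min().unwrap_or(0);
        // Prefer a longer shortest literal; break ties with fewer literals.
        (min_len, std::cmp::Reverse(lits.len()))
    }
    if score(&a) >= score(&b) { a } else { b }
}

fn main() {
    // For a pattern like \w+(foo|bar)[a-z]+quux\d+, the concat walk first
    // collects ["foo", "bar"], hits the inexact gap [a-z]+, parks that set
    // in `prev`, starts fresh, collects ["quux"], and finally chooses.
    let prev = vec!["foo", "bar"];
    let current = vec!["quux"];
    assert_eq!(choose(prev, current), vec!["quux"]);
}
```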
Section 4: Handling Alternation
/// Extract from alternation via union of sub-expressions
fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>(
&self,
it: I,
) -> TSeq {
let mut seq = TSeq::empty();
for hir in it {
// Infinite union with anything stays infinite - short circuit
if !seq.is_finite() {
break;
}
seq = self.union(seq, &mut self.extract(hir));
}
seq
}
/// Union with limit enforcement and trimming optimization
fn union(&self, mut seq1: TSeq, seq2: &mut TSeq) -> TSeq {
if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total)
{
// Try trimming to 4 bytes before giving up
// 4 bytes chosen for Teddy algorithm compatibility
seq1.keep_first_bytes(4);
seq2.keep_first_bytes(4);
seq1.dedup();
seq2.dedup();
// Still over limit? Make infinite
if seq1
.max_union_len(seq2)
.map_or(false, |len| len > self.limit_total)
{
seq2.make_infinite();
}
}
seq1.union(seq2);
assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
seq1.prefix = seq1.prefix && seq2.prefix;
seq1
}
The magic number 4 for keep_first_bytes is an abstraction leak from the Teddy SIMD algorithm, which keys its search off a short fingerprint of each literal (up to the first 4 bytes) and verifies candidate matches separately. Trimming to 4 bytes therefore gives up little of the sequence's usefulness while making room to keep more literals under the total limit.
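The effect of that trimming can be observed directly on regex-syntax's public Seq type (the words below are invented; all that matters is that they share a 4-byte prefix):

```rust
use regex_syntax::hir::literal::Seq;

fn main() {
    // Five long literals that share their first four bytes.
    let mut seq = Seq::new([
        "deliberate", "delicate", "delightful", "delimiter", "delivery",
    ]);
    assert_eq!(seq.len(), Some(5));

    // Trimming to 4 bytes turns each literal into an inexact "deli";
    // deduping then collapses them into one entry, freeing room under
    // the total-literal limit for literals from other branches.
    seq.keep_first_bytes(4);
    seq.dedup();
    assert_eq!(seq.len(), Some(1));
    println!("{seq:?}");
}
```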
Section 5: Handling Repetition
fn extract_repetition(&self, rep: &hir::Repetition) -> TSeq {
let mut subseq = self.extract(&rep.sub);
match *rep {
// Optional: a? means "a or empty"
hir::Repetition { min: 0, max, greedy, .. } => {
// max=1 preserves exactness (a? = a|"")
if max != Some(1) {
subseq.make_inexact();
}
let mut empty = TSeq::singleton(Literal::exact(vec![]));
// Greedy vs lazy affects union order
if !greedy {
std::mem::swap(&mut subseq, &mut empty);
}
self.union(subseq, &mut empty)
}
// Fixed repetition: a{3} -> "aaa"
hir::Repetition { min, max: Some(max), .. } if min == max => {
assert!(min > 0);
let limit = u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
let mut seq = TSeq::singleton(Literal::exact(vec![]));
// Unroll the repetition via repeated cross product
for _ in 0..std::cmp::min(min, limit) {
if seq.is_inexact() {
break;
}
seq = self.cross(seq, subseq.clone());
}
// Exceeded limit means we didn't unroll completely
if usize::try_from(min).is_err() || min > limit {
seq.make_inexact();
}
seq
}
// Bounded range: a{3,5} -> inexact "aaa"
hir::Repetition { min, max: Some(max), .. } if min < max => {
assert!(min > 0);
let limit = u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
let mut seq = TSeq::singleton(Literal::exact(vec![]));
for _ in 0..std::cmp::min(min, limit) {
if seq.is_inexact() {
break;
}
seq = self.cross(seq, subseq.clone());
}
seq.make_inexact(); // Variable count = always inexact
seq
}
// Unbounded: a+ -> inexact "a"
hir::Repetition { .. } => {
subseq.make_inexact();
subseq
}
}
}
The greedy flag affects the order of alternatives in the union. For a* (greedy), we prefer matching a first; for a*? (lazy), we prefer the empty match first. This ordering propagates to how the literals are used in searching.
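A minimal sketch built directly on regex-syntax's Seq and Literal (rather than TSeq) shows how swapping the operands changes the union order for a? versus a??:

```rust
use regex_syntax::hir::literal::{Literal, Seq};

fn main() {
    let a = Seq::singleton(Literal::exact(b"a".to_vec()));
    let empty = Seq::singleton(Literal::exact(vec![]));

    // Greedy a?: the subexpression's literal first, then the empty match.
    let mut greedy = a.clone();
    greedy.union(&mut empty.clone());

    // Lazy a??: operands swapped, so the empty match comes first.
    let mut lazy = empty.clone();
    lazy.union(&mut a.clone());

    // The two sequences contain the same literals in opposite orders.
    println!("a?  -> {greedy:?}");
    println!("a?? -> {lazy:?}");
}
```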
Quick Reference
TSeq (Tagged Sequence) Key Methods
| Method | Purpose |
|---|---|
| is_exact() | All literals are complete matches |
| is_inexact() | At least one literal is partial |
| is_finite() | Has enumerable literals |
| is_good() | Heuristically useful for acceleration |
| is_really_good() | Good enough to short-circuit extraction |
| choose(other) | Pick better sequence via heuristics |
| make_not_prefix() | Mark as not at pattern start |
Heuristic Thresholds
// "Good" for acceleration:
min_literal_len >= 2 && count <= 64
// Or if very short literals:
min_literal_len <= 1 && count <= 3
// "Really good" (short-circuit worthy):
min_literal_len >= 3 && count <= 8
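A sketch of how these thresholds plausibly translate into code, assuming access to the underlying Seq and treating the poison check from the next subsection as a precondition:

```rust
use regex_syntax::hir::literal::Seq;

fn is_good(seq: &Seq, has_poisonous: bool) -> bool {
    if has_poisonous {
        return false;
    }
    let (Some(min), Some(len)) = (seq.min_literal_len(), seq.len()) else {
        return false; // an infinite sequence is never useful
    };
    // Very short literals are only tolerated in very small sets.
    if min <= 1 {
        return len <= 3;
    }
    min >= 2 && len <= 64
}

fn is_really_good(seq: &Seq, has_poisonous: bool) -> bool {
    if has_poisonous {
        return false;
    }
    let (Some(min), Some(len)) = (seq.min_literal_len(), seq.len()) else {
        return false;
    };
    min >= 3 && len <= 8
}

fn main() {
    // Two reasonably long literals: both good and really good.
    let seq = Seq::new(["Frobnicate", "Quuxify"]);
    assert!(is_good(&seq, false));
    assert!(is_really_good(&seq, false));
}
```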
Poisonous Literals
/// Literals likely to match too frequently
fn is_poisonous(lit: &Literal) -> bool {
use regex_syntax::hir::literal::rank;
// Empty string or high-frequency single byte
lit.is_empty() || (lit.len() == 1 && rank(lit.as_bytes()[0]) >= 250)
}
The rank function returns a frequency score (0-255) based on how commonly a byte appears in typical text. Rank 250+ includes space, newline, and other ubiquitous characters.
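Since rank is public in regex-syntax, its behavior can be inspected directly; the snippet below prints a few ranks rather than asserting specific table values:

```rust
use regex_syntax::hir::literal::rank;

fn main() {
    // Higher rank = more common in the background frequency model.
    for byte in [b' ', b'\n', b'e', b'z', 0x00u8] {
        println!("rank({:?}) = {}", byte as char, rank(byte));
    }
}
```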
Extraction Limits
| Limit | Default | Purpose |
|---|---|---|
| limit_class | 10 | Max chars from [abc...] |
| limit_repeat | 10 | Max unroll iterations for a{n} |
| limit_literal_len | 100 | Max bytes per literal |
| limit_total | 64 | Max literals in sequence |
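regex-syntax's public prefix/suffix Extractor exposes the same four knobs, so it is a convenient stand-in for watching a limit in action even though ripgrep's inner extractor is a separate, private type. Here limit_class decides whether a character class gets enumerated at all:

```rust
use regex_syntax::{hir::literal::Extractor, Parser};

fn main() {
    let mut extractor = Extractor::new();
    extractor.limit_class(10);

    // [0-9] expands to exactly 10 single-byte literals, so the class is
    // enumerated and crossed with "foo".
    let hir = Parser::new().parse(r"[0-9]foo").unwrap();
    println!("{:?}", extractor.extract(&hir));

    // [a-z] would need 26 literals, past the limit, so the class is not
    // enumerated and no useful prefixes come out.
    let hir = Parser::new().parse(r"[a-z]foo").unwrap();
    println!("{:?}", extractor.extract(&hir));
}
```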