Skip to content

Commit 2918aab

Browse files
committed
regex: expose lower level search APIs
This commit exposes two new areas of API surface: 1. A new `captures_read` method which provides a way to access the offsets of submatches while amortizing the allocation of the space required to store those offsets. Callers should still of course prefer to use the higher level `captures` method, but if performance dictates, this lower level API may be useful. 2. New "at" variants of shortest_match/is_match/find/captures/captures_read that permit controlling where the start of a search begins within a slice. This is typically useful for controlling the match semantics of look-around operators such as `^` and `$`, and is necessary for implementing non-overlapping iterators. Fixes #219
1 parent 607ac5c commit 2918aab

File tree

7 files changed

+243
-49
lines changed

7 files changed

+243
-49
lines changed

Diff for: ci/script.sh

+1-3
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,7 @@ cargo test --verbose --manifest-path regex-syntax/Cargo.toml
3030
cargo doc --verbose --manifest-path regex-syntax/Cargo.toml
3131

3232
# Run tests on regex-capi crate.
33-
cargo build --verbose --manifest-path regex-capi/Cargo.toml
34-
(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
35-
(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)
33+
ci/test-regex-capi
3634

3735
# Make sure benchmarks compile. Don't run them though because they take a
3836
# very long time. Also, check that we can build the regex-debug tool.

Diff for: ci/test-regex-capi

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/sh
2+
3+
set -e
4+
5+
cargo build --verbose --manifest-path regex-capi/Cargo.toml
6+
(cd regex-capi/ctest && ./compile && LD_LIBRARY_PATH=../../target/debug ./test)
7+
(cd regex-capi/examples && ./compile && LD_LIBRARY_PATH=../../target/debug ./iter)

Diff for: src/exec.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use prog::Program;
2929
use re_builder::RegexOptions;
3030
use re_bytes;
3131
use re_set;
32-
use re_trait::{RegularExpression, Slot, Locations, as_slots};
32+
use re_trait::{RegularExpression, Slot, Locations};
3333
use re_unicode;
3434
use utf8::next_utf8;
3535

@@ -359,13 +359,13 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
359359
}
360360

361361
#[inline(always)] // reduces constant overhead
362-
fn read_captures_at(
362+
fn captures_read_at(
363363
&self,
364364
locs: &mut Locations,
365365
text: &str,
366366
start: usize,
367367
) -> Option<(usize, usize)> {
368-
self.0.read_captures_at(locs, text.as_bytes(), start)
368+
self.0.captures_read_at(locs, text.as_bytes(), start)
369369
}
370370
}
371371

@@ -528,13 +528,13 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
528528
///
529529
/// Note that the first two slots always correspond to the start and end
530530
/// locations of the overall match.
531-
fn read_captures_at(
531+
fn captures_read_at(
532532
&self,
533533
locs: &mut Locations,
534534
text: &[u8],
535535
start: usize,
536536
) -> Option<(usize, usize)> {
537-
let slots = as_slots(locs);
537+
let slots = locs.as_slots();
538538
for slot in slots.iter_mut() {
539539
*slot = None;
540540
}

Diff for: src/lib.rs

+1-2
Original file line numberDiff line numberDiff line change
@@ -542,11 +542,11 @@ pub use re_builder::set_unicode::*;
542542
#[cfg(feature = "use_std")]
543543
pub use re_set::unicode::*;
544544
#[cfg(feature = "use_std")]
545-
pub use re_trait::Locations;
546545
#[cfg(feature = "use_std")]
547546
pub use re_unicode::{
548547
Regex, Match, Captures,
549548
CaptureNames, Matches, CaptureMatches, SubCaptureMatches,
549+
CaptureLocations, Locations,
550550
Replacer, ReplacerRef, NoExpand, Split, SplitN,
551551
escape,
552552
};
@@ -644,7 +644,6 @@ pub mod bytes {
644644
pub use re_builder::set_bytes::*;
645645
pub use re_bytes::*;
646646
pub use re_set::bytes::*;
647-
pub use re_trait::Locations;
648647
}
649648

650649
mod backtrack;

Diff for: src/re_bytes.rs

+112-16
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use exec::{Exec, ExecNoSync};
2121
use expand::expand_bytes;
2222
use error::Error;
2323
use re_builder::bytes::RegexBuilder;
24-
use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter};
24+
use re_trait::{self, RegularExpression, SubCapturesPosIter};
2525

2626
/// Match represents a single match of a regex in a haystack.
2727
///
@@ -252,10 +252,10 @@ impl Regex {
252252
/// The `0`th capture group is always unnamed, so it must always be
253253
/// accessed with `get(0)` or `[0]`.
254254
pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
255-
let mut locs = self.locations();
256-
self.read_captures_at(&mut locs, text, 0).map(|_| Captures {
255+
let mut locs = self.capture_locations();
256+
self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
257257
text: text,
258-
locs: locs,
258+
locs: locs.0,
259259
named_groups: self.0.capture_name_idx().clone(),
260260
})
261261
}
@@ -568,7 +568,6 @@ impl Regex {
568568
/// The significance of the starting point is that it takes the surrounding
569569
/// context into consideration. For example, the `\A` anchor can only
570570
/// match when `start == 0`.
571-
#[doc(hidden)]
572571
pub fn shortest_match_at(
573572
&self,
574573
text: &[u8],
@@ -583,7 +582,6 @@ impl Regex {
583582
/// The significance of the starting point is that it takes the surrounding
584583
/// context into consideration. For example, the `\A` anchor can only
585584
/// match when `start == 0`.
586-
#[doc(hidden)]
587585
pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
588586
self.shortest_match_at(text, start).is_some()
589587
}
@@ -594,7 +592,6 @@ impl Regex {
594592
/// The significance of the starting point is that it takes the surrounding
595593
/// context into consideration. For example, the `\A` anchor can only
596594
/// match when `start == 0`.
597-
#[doc(hidden)]
598595
pub fn find_at<'t>(
599596
&self,
600597
text: &'t [u8],
@@ -604,21 +601,55 @@ impl Regex {
604601
.map(|(s, e)| Match::new(text, s, e))
605602
}
606603

607-
/// Returns the same as captures, but starts the search at the given
604+
/// This is like `captures`, but uses
605+
/// [`CaptureLocations`](struct.CaptureLocations.html)
606+
/// instead of
607+
/// [`Captures`](struct.Captures.html) in order to amortize allocations.
608+
///
609+
/// To create a `CaptureLocations` value, use the
610+
/// `Regex::capture_locations` method.
611+
///
612+
/// This returns the overall match if this was successful, which is always
613+
/// equivalent to the `0`th capture group.
614+
pub fn captures_read<'t>(
615+
&self,
616+
locs: &mut CaptureLocations,
617+
text: &'t [u8],
618+
) -> Option<Match<'t>> {
619+
self.captures_read_at(locs, text, 0)
620+
}
621+
622+
/// Returns the same as `captures_read`, but starts the search at the given
608623
/// offset and populates the capture locations given.
609624
///
610625
/// The significance of the starting point is that it takes the surrounding
611626
/// context into consideration. For example, the `\A` anchor can only
612627
/// match when `start == 0`.
628+
pub fn captures_read_at<'t>(
629+
&self,
630+
locs: &mut CaptureLocations,
631+
text: &'t [u8],
632+
start: usize,
633+
) -> Option<Match<'t>> {
634+
self.0
635+
.searcher()
636+
.captures_read_at(&mut locs.0, text, start)
637+
.map(|(s, e)| Match::new(text, s, e))
638+
}
639+
640+
/// An undocumented alias for `captures_read_at`.
641+
///
642+
/// The `regex-capi` crate previously used this routine, so to avoid
643+
/// breaking that crate, we continue to provide the name as an undocumented
644+
/// alias.
613645
#[doc(hidden)]
614646
pub fn read_captures_at<'t>(
615647
&self,
616-
locs: &mut Locations,
648+
locs: &mut CaptureLocations,
617649
text: &'t [u8],
618650
start: usize,
619651
) -> Option<Match<'t>> {
620-
self.0.searcher().read_captures_at(locs, text, start)
621-
.map(|(s, e)| Match::new(text, s, e))
652+
self.captures_read_at(locs, text, start)
622653
}
623654
}
624655

@@ -639,11 +670,19 @@ impl Regex {
639670
self.0.capture_names().len()
640671
}
641672

642-
/// Returns an empty set of locations that can be reused in multiple calls
643-
/// to `read_captures`.
673+
/// Returns an empty set of capture locations that can be reused in
674+
/// multiple calls to `captures_read` or `captures_read_at`.
675+
pub fn capture_locations(&self) -> CaptureLocations {
676+
CaptureLocations(self.0.searcher().locations())
677+
}
678+
679+
/// An alias for `capture_locations` to preserve backward compatibility.
680+
///
681+
/// The `regex-capi` crate uses this method, so to avoid breaking that
682+
/// crate, we continue to export it as an undocumented API.
644683
#[doc(hidden)]
645-
pub fn locations(&self) -> Locations {
646-
self.0.searcher().locations()
684+
pub fn locations(&self) -> CaptureLocations {
685+
CaptureLocations(self.0.searcher().locations())
647686
}
648687
}
649688

@@ -769,6 +808,63 @@ impl<'r> Iterator for CaptureNames<'r> {
769808
}
770809
}
771810

811+
/// CaptureLocations is a low level representation of the raw offsets of each
812+
/// submatch.
813+
///
814+
/// You can think of this as a lower level
815+
/// [`Captures`](struct.Captures.html), where this type does not support
816+
/// named capturing groups directly and it does not borrow the text that these
817+
/// offsets were matched on.
818+
///
819+
/// Primarily, this type is useful when using the lower level `Regex` APIs
820+
/// such as `captures_read`, which permits amortizing the allocation in which
821+
/// capture match locations are stored.
822+
///
823+
/// In order to build a value of this type, you'll need to call the
824+
/// `capture_locations` method on the `Regex` being used to execute the search.
825+
/// The value returned can then be reused in subsequent searches.
826+
#[derive(Clone, Debug)]
827+
pub struct CaptureLocations(re_trait::Locations);
828+
829+
/// A type alias for `CaptureLocations` for backwards compatibility.
830+
///
831+
/// Previously, we exported `CaptureLocations` as `Locations` in an
832+
/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`),
833+
/// we continue re-exporting the same undocumented API.
834+
#[doc(hidden)]
835+
pub type Locations = CaptureLocations;
836+
837+
impl CaptureLocations {
838+
/// Returns the start and end positions of the Nth capture group. Returns
839+
/// `None` if `i` is not a valid capture group or if the capture group did
840+
/// not match anything. The positions returned are *always* byte indices
841+
/// with respect to the original string matched.
842+
#[inline]
843+
pub fn get(&self, i: usize) -> Option<(usize, usize)> {
844+
self.0.pos(i)
845+
}
846+
847+
/// Returns the total number of capturing groups.
848+
///
849+
/// This is always at least `1` since every regex has at least `1`
850+
/// capturing group that corresponds to the entire match.
851+
#[inline]
852+
pub fn len(&self) -> usize {
853+
self.0.len()
854+
}
855+
856+
/// An alias for the `get` method for backwards compatibility.
857+
///
858+
/// Previously, we exported `get` as `pos` in an undocumented API. To
859+
/// prevent breaking that code (e.g., in `regex-capi`), we continue
860+
/// re-exporting the same undocumented API.
861+
#[doc(hidden)]
862+
#[inline]
863+
pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
864+
self.get(i)
865+
}
866+
}
867+
772868
/// Captures represents a group of captured byte strings for a single match.
773869
///
774870
/// The 0th capture always corresponds to the entire match. Each subsequent
@@ -782,7 +878,7 @@ impl<'r> Iterator for CaptureNames<'r> {
782878
/// `'t` is the lifetime of the matched text.
783879
pub struct Captures<'t> {
784880
text: &'t [u8],
785-
locs: Locations,
881+
locs: re_trait::Locations,
786882
named_groups: Arc<HashMap<String, usize>>,
787883
}
788884

Diff for: src/re_trait.rs

+7-7
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ pub type Slot = Option<usize>;
1818
///
1919
/// Unlike `Captures`, a `Locations` value only stores offsets.
2020
#[doc(hidden)]
21+
#[derive(Clone, Debug)]
2122
pub struct Locations(Vec<Slot>);
2223

2324
impl Locations {
@@ -47,12 +48,11 @@ impl Locations {
4748
pub fn len(&self) -> usize {
4849
self.0.len() / 2
4950
}
50-
}
5151

52-
/// This is a hack to make Locations -> &mut [Slot] be available internally
53-
/// without exposing it in the public API.
54-
pub fn as_slots(locs: &mut Locations) -> &mut [Slot] {
55-
&mut locs.0
52+
/// Return the individual slots as a slice.
53+
pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
54+
&mut self.0
55+
}
5656
}
5757

5858
/// An iterator over capture group positions for a particular match of a
@@ -139,7 +139,7 @@ pub trait RegularExpression: Sized {
139139

140140
/// Returns the leftmost-first match location if one exists, and also
141141
/// fills in any matching capture slot locations.
142-
fn read_captures_at(
142+
fn captures_read_at(
143143
&self,
144144
locs: &mut Locations,
145145
text: &Self::Text,
@@ -246,7 +246,7 @@ impl<'t, R> Iterator for CaptureMatches<'t, R>
246246
return None
247247
}
248248
let mut locs = self.0.re.locations();
249-
let (s, e) = match self.0.re.read_captures_at(
249+
let (s, e) = match self.0.re.captures_read_at(
250250
&mut locs,
251251
self.0.text,
252252
self.0.last_end,

0 commit comments

Comments
 (0)