Skip to content

Commit

Permalink
Merge pull request #158 from defuz/improve-name-mapping
Browse files Browse the repository at this point in the history
Storing mapping from names to group indices into Regex
  • Loading branch information
BurntSushi committed Feb 19, 2016
2 parents f34213c + 0144613 commit aa124b1
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 67 deletions.
16 changes: 16 additions & 0 deletions regex_macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,19 @@ impl<'a> NfaGen<'a> {
None => cx.expr_none(self.sp),
}
);
let named_groups = {
let mut named_groups = ::std::collections::BTreeMap::new();
for (i, name) in self.names.iter().enumerate() {
if let Some(ref name) = *name {
named_groups.insert(name.to_owned(), i);
}
}
self.vec_expr(named_groups.iter(),
&mut |cx, (name, group_idx)|
quote_expr!(cx, ($name, $group_idx))
)
};

let prefix_anchor = self.prog.anchored_begin;

let step_insts = self.step_insts();
Expand All @@ -123,6 +136,8 @@ impl<'a> NfaGen<'a> {
// unused code generated by regex!. See #14185 for an example.
#[allow(dead_code)]
static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names;
#[allow(dead_code)]
static NAMED_GROUPS: &'static [(&'static str, usize)] = &$named_groups;

#[allow(dead_code)]
fn exec<'t>(
Expand Down Expand Up @@ -308,6 +323,7 @@ fn exec<'t>(
::regex::Regex::Native(::regex::internal::ExNative {
original: $regex,
names: &CAP_NAMES,
groups: &NAMED_GROUPS,
prog: exec,
})
})
Expand Down
9 changes: 9 additions & 0 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::collections::HashMap;
use std::sync::Arc;

use backtrack::{self, Backtrack};
use dfa::{self, Dfa, DfaResult};
use input::{ByteInput, CharInput};
Expand Down Expand Up @@ -375,6 +378,12 @@ impl Exec {
&self.prog.cap_names
}

/// Returns the shared map of named capture groups held by the underlying
/// program.
///
/// Keys are capture group names and values are the corresponding capture
/// group indices. The map is handed out behind an `Arc`, so a caller that
/// needs to keep it can clone the `Arc` instead of copying the map.
pub fn named_groups(&self) -> &Arc<HashMap<String, usize>> {
&self.prog.named_groups
}

/// Return a fresh allocation for storing all possible captures in the
/// underlying regular expression.
pub fn alloc_captures(&self) -> Vec<Option<usize>> {
Expand Down
13 changes: 13 additions & 0 deletions src/program.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::collections::HashMap;
use std::sync::Arc;

use syntax;

use backtrack::BacktrackCache;
Expand Down Expand Up @@ -39,6 +42,9 @@ pub struct Program {
/// The sequence of capture group names. There is an entry for each capture
/// group index and a name exists only if the capture group is named.
pub cap_names: Vec<Option<String>>,
/// The map of named capture groups. The keys are group names and
/// the values are group indices.
pub named_groups: Arc<HashMap<String, usize>>,
/// If the regular expression requires a literal prefix in order to have a
/// match, that prefix is stored here as a DFA.
pub prefixes: Literals,
Expand Down Expand Up @@ -115,10 +121,17 @@ impl ProgramBuilder {
insts.anchored_begin(),
insts.anchored_end(),
);
let mut named_groups = HashMap::new();
for (i, name) in cap_names.iter().enumerate() {
if let Some(ref name) = *name {
named_groups.insert(name.to_owned(), i);
}
}
Ok(Program {
original: self.re,
insts: insts,
cap_names: cap_names,
named_groups: Arc::new(named_groups),
prefixes: prefixes,
anchored_begin: anchored_begin,
anchored_end: anchored_end,
Expand Down
178 changes: 111 additions & 67 deletions src/re.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@

use std::borrow::Cow;
use std::collections::HashMap;
use std::collections::hash_map::Iter;
use std::fmt;
use std::ops::Index;
#[cfg(feature = "pattern")]
use std::str::pattern::{Pattern, Searcher, SearchStep};
use std::str::FromStr;
use std::sync::Arc;

use exec::{Exec, ExecBuilder};
use syntax;
Expand Down Expand Up @@ -186,6 +186,8 @@ pub struct ExNative {
#[doc(hidden)]
pub names: &'static &'static [Option<&'static str>],
#[doc(hidden)]
pub groups: &'static &'static [(&'static str, usize)],
#[doc(hidden)]
pub prog: fn(&mut CaptureIdxs, &str, usize) -> bool,
}

Expand Down Expand Up @@ -395,9 +397,13 @@ impl Regex {
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `at(0)` or `[0]`.
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
let mut caps = self.alloc_captures();
if exec(self, &mut caps, text, 0) {
Some(Captures::new(self, text, caps))
let mut locs = self.alloc_captures();
if exec(self, &mut locs, text, 0) {
Some(Captures {
text: text,
locs: locs,
named_groups: NamedGroups::from_regex(self)
})
} else {
None
}
Expand Down Expand Up @@ -804,6 +810,71 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
}
}

/// A lookup from capture group name to capture group index.
///
/// The representation depends on which kind of `Regex` produced it; see
/// `NamedGroups::from_regex`.
enum NamedGroups {
/// No named groups are available. Produced for a dynamic regex whose
/// name map is empty.
Empty,
/// A static table of `(name, index)` pairs taken from a native regex.
/// Lookups use binary search, so the table must be sorted by name.
Native(&'static [(&'static str, usize)]),
/// A reference-counted handle to the name map of a dynamic regex.
Dynamic(Arc<HashMap<String, usize>>),
}

impl NamedGroups {
    /// Builds the name lookup that matches the kind of `regex`.
    ///
    /// A native regex contributes its static `(name, index)` table as-is.
    /// A dynamic regex shares the map owned by its `Exec`; when that map
    /// has no entries, `Empty` is returned instead of a handle to it.
    fn from_regex(regex: &Regex) -> NamedGroups {
        match *regex {
            Regex::Native(ref native) => NamedGroups::Native(*native.groups),
            Regex::Dynamic(ref exec) => {
                let shared = exec.named_groups();
                if shared.is_empty() {
                    return NamedGroups::Empty;
                }
                NamedGroups::Dynamic(shared.clone())
            }
        }
    }

    /// Returns the capture group index registered under `name`, or `None`
    /// when no group has that name.
    fn pos(&self, name: &str) -> Option<usize> {
        match *self {
            NamedGroups::Empty => None,
            NamedGroups::Native(table) => {
                // Binary search relies on the table being sorted by name.
                match table.binary_search_by(|&(key, _)| key.cmp(name)) {
                    Ok(slot) => Some(table[slot].1),
                    Err(_) => None,
                }
            }
            NamedGroups::Dynamic(ref map) => map.get(name).cloned(),
        }
    }

    /// Returns an iterator over every `(name, index)` pair in this lookup.
    fn iter<'n>(&'n self) -> NamedGroupsIter<'n> {
        match *self {
            NamedGroups::Empty => NamedGroupsIter::Empty,
            NamedGroups::Native(table) => {
                NamedGroupsIter::Native(table.iter())
            }
            NamedGroups::Dynamic(ref map) => {
                NamedGroupsIter::Dynamic(map.iter())
            }
        }
    }
}

/// An iterator over the `(name, index)` pairs of a `NamedGroups` value.
///
/// `'n` is the lifetime of the borrow of the `NamedGroups` being iterated.
enum NamedGroupsIter<'n> {
/// Yields nothing.
Empty,
/// Walks a static slice of `(name, index)` pairs.
Native(::std::slice::Iter<'static, (&'static str, usize)>),
/// Walks the entries of a borrowed name-to-index hash map.
Dynamic(::std::collections::hash_map::Iter<'n, String, usize>),
}

impl<'n> Iterator for NamedGroupsIter<'n> {
    type Item = (&'n str, usize);

    /// Produces the next `(name, index)` pair from whichever source backs
    /// this iterator, or `None` once that source is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        match *self {
            NamedGroupsIter::Empty => None,
            // Slice entries are `Copy` tuples, so copy them out directly.
            NamedGroupsIter::Native(ref mut pairs) => pairs.next().cloned(),
            NamedGroupsIter::Dynamic(ref mut entries) => {
                entries.next().map(|(name, &index)| (&name[..], index))
            }
        }
    }
}

/// Captures represents a group of captured strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
Expand All @@ -818,34 +889,10 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
pub struct Captures<'t> {
text: &'t str,
locs: Vec<Option<usize>>,
named: Option<HashMap<String, usize>>,
named_groups: NamedGroups,
}

impl<'t> Captures<'t> {
fn new(
re: &Regex,
search: &'t str,
locs: Vec<Option<usize>>,
) -> Captures<'t> {
let named =
if re.captures_len() == 0 {
None
} else {
let mut named = HashMap::new();
for (i, name) in re.capture_names().enumerate() {
if let Some(name) = name {
named.insert(name.to_owned(), i);
}
}
Some(named)
};
Captures {
text: search,
locs: locs,
named: named,
}
}

/// Returns the start and end positions of the Nth capture group.
/// Returns `None` if `i` is not a valid capture group or if the capture
/// group did not match anything.
Expand Down Expand Up @@ -874,15 +921,7 @@ impl<'t> Captures<'t> {
/// `name` isn't a valid capture group or didn't match anything, then
/// `None` is returned.
pub fn name(&self, name: &str) -> Option<&'t str> {
match self.named {
None => None,
Some(ref h) => {
match h.get(name) {
None => None,
Some(i) => self.at(*i),
}
}
}
self.named_groups.pos(name).and_then(|i| self.at(i))
}

/// Creates an iterator of all the capture groups in order of appearance
Expand All @@ -895,7 +934,7 @@ impl<'t> Captures<'t> {
/// appearance in the regular expression. Positions are byte indices
/// in terms of the original string matched.
pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
SubCapturesPos { idx: 0, caps: self, }
SubCapturesPos { idx: 0, locs: &self.locs }
}

/// Creates an iterator of all named groups as a tuple with the group
Expand All @@ -904,7 +943,7 @@ impl<'t> Captures<'t> {
pub fn iter_named(&'t self) -> SubCapturesNamed<'t> {
SubCapturesNamed {
caps: self,
inner: self.named.as_ref().map(|n| n.iter()),
names: self.named_groups.iter()
}
}

Expand Down Expand Up @@ -978,16 +1017,16 @@ impl<'t> Index<&'t str> for Captures<'t> {
/// An iterator over capture groups for a particular match of a regular
/// expression.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCaptures<'t> {
/// `'c` is the lifetime of the captures.
pub struct SubCaptures<'c> {
idx: usize,
caps: &'t Captures<'t>,
caps: &'c Captures<'c>,
}

impl<'t> Iterator for SubCaptures<'t> {
type Item = Option<&'t str>;
impl<'c> Iterator for SubCaptures<'c> {
type Item = Option<&'c str>;

fn next(&mut self) -> Option<Option<&'t str>> {
fn next(&mut self) -> Option<Option<&'c str>> {
if self.idx < self.caps.len() {
self.idx += 1;
Some(self.caps.at(self.idx - 1))
Expand All @@ -1002,42 +1041,43 @@ impl<'t> Iterator for SubCaptures<'t> {
///
/// Positions are byte indices in terms of the original string matched.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCapturesPos<'t> {
/// `'c` is the lifetime of the captures.
pub struct SubCapturesPos<'c> {
idx: usize,
caps: &'t Captures<'t>,
locs: &'c [Option<usize>]
}

impl<'t> Iterator for SubCapturesPos<'t> {
impl<'c> Iterator for SubCapturesPos<'c> {
type Item = Option<(usize, usize)>;

fn next(&mut self) -> Option<Option<(usize, usize)>> {
if self.idx < self.caps.len() {
self.idx += 1;
Some(self.caps.pos(self.idx - 1))
} else {
None
if self.idx >= self.locs.len() {
return None
}
let r = match (self.locs[self.idx], self.locs[self.idx + 1]) {
(Some(s), Some(e)) => Some((s, e)),
(None, None) => None,
_ => unreachable!()
};
self.idx += 2;
Some(r)
}
}

/// An Iterator over named capture groups as a tuple with the group
/// name and the value.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCapturesNamed<'t>{
caps: &'t Captures<'t>,
inner: Option<Iter<'t, String, usize>>,
/// `'c` is the lifetime of the captures.
pub struct SubCapturesNamed<'c> {
caps: &'c Captures<'c>,
names: NamedGroupsIter<'c>,
}

impl<'t> Iterator for SubCapturesNamed<'t> {
type Item = (&'t str, Option<&'t str>);
impl<'c> Iterator for SubCapturesNamed<'c> {
type Item = (&'c str, Option<&'c str>);

fn next(&mut self) -> Option<(&'t str, Option<&'t str>)> {
match self.inner.as_mut().map_or(None, |it| it.next()) {
Some((name, pos)) => Some((name, self.caps.at(*pos))),
None => None
}
fn next(&mut self) -> Option<(&'c str, Option<&'c str>)> {
self.names.next().map(|(name, pos)| (name, self.caps.at(pos)))
}
}

Expand Down Expand Up @@ -1081,7 +1121,11 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> {
}
self.last_end = e;
self.last_match = Some(self.last_end);
Some(Captures::new(self.re, self.search, caps))
Some(Captures {
text: self.search,
locs: caps,
named_groups: NamedGroups::from_regex(self.re),
})
}
}

Expand Down

0 comments on commit aa124b1

Please sign in to comment.