Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Storing mapping from names to group indices into Regex #158

Merged
merged 5 commits into from
Feb 19, 2016
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions regex_macros/src/lib.rs
Original file line number Diff line number Diff line change
@@ -109,6 +109,19 @@ impl<'a> NfaGen<'a> {
None => cx.expr_none(self.sp),
}
);
let named_groups = {
let mut named_groups = ::std::collections::BTreeMap::new();
for (i, name) in self.names.iter().enumerate() {
if let Some(ref name) = *name {
named_groups.insert(name.to_owned(), i);
}
}
self.vec_expr(named_groups.iter(),
&mut |cx, (name, group_idx)|
quote_expr!(cx, ($name, $group_idx))
)
};

let prefix_anchor = self.prog.anchored_begin;

let step_insts = self.step_insts();
@@ -123,6 +136,8 @@ impl<'a> NfaGen<'a> {
// unused code generated by regex!. See #14185 for an example.
#[allow(dead_code)]
static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names;
#[allow(dead_code)]
static NAMED_GROUPS: &'static [(&'static str, usize)] = &$named_groups;

#[allow(dead_code)]
fn exec<'t>(
@@ -308,6 +323,7 @@ fn exec<'t>(
::regex::Regex::Native(::regex::internal::ExNative {
original: $regex,
names: &CAP_NAMES,
groups: &NAMED_GROUPS,
prog: exec,
})
})
9 changes: 9 additions & 0 deletions src/exec.rs
Original file line number Diff line number Diff line change
@@ -8,6 +8,9 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::collections::HashMap;
use std::sync::Arc;

use backtrack::{self, Backtrack};
use dfa::{self, Dfa, DfaResult};
use input::{ByteInput, CharInput};
@@ -375,6 +378,12 @@ impl Exec {
&self.prog.cap_names
}

/// Returns a reference to the shared map from capture group name to
/// capture group index.
///
/// Only named groups have an entry. The map sits behind an `Arc`, so a
/// caller that needs to keep it can clone the handle instead of copying
/// the map itself.
pub fn named_groups(&self) -> &Arc<HashMap<String, usize>> {
&self.prog.named_groups
}

/// Return a fresh allocation for storing all possible captures in the
/// underlying regular expression.
pub fn alloc_captures(&self) -> Vec<Option<usize>> {
13 changes: 13 additions & 0 deletions src/program.rs
Original file line number Diff line number Diff line change
@@ -8,6 +8,9 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::collections::HashMap;
use std::sync::Arc;

use syntax;

use backtrack::BacktrackCache;
@@ -39,6 +42,9 @@ pub struct Program {
/// The sequence of capture group names. There is an entry for each capture
/// group index and a name exists only if the capture group is named.
pub cap_names: Vec<Option<String>>,
/// The map of named capture groups. The keys are group names and
/// the values are group indices.
pub named_groups: Arc<HashMap<String, usize>>,
/// If the regular expression requires a literal prefix in order to have a
/// match, that prefix is stored here as a DFA.
pub prefixes: Literals,
@@ -115,10 +121,17 @@ impl ProgramBuilder {
insts.anchored_begin(),
insts.anchored_end(),
);
let mut named_groups = HashMap::new();
for (i, name) in cap_names.iter().enumerate() {
if let Some(ref name) = *name {
named_groups.insert(name.to_owned(), i);
}
}
Ok(Program {
original: self.re,
insts: insts,
cap_names: cap_names,
named_groups: Arc::new(named_groups),
prefixes: prefixes,
anchored_begin: anchored_begin,
anchored_end: anchored_end,
178 changes: 111 additions & 67 deletions src/re.rs
Original file line number Diff line number Diff line change
@@ -10,12 +10,12 @@

use std::borrow::Cow;
use std::collections::HashMap;
use std::collections::hash_map::Iter;
use std::fmt;
use std::ops::Index;
#[cfg(feature = "pattern")]
use std::str::pattern::{Pattern, Searcher, SearchStep};
use std::str::FromStr;
use std::sync::Arc;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small nit, but I try to order imports by:

  1. Category. (Group by std, then group by third party crates and then group by internal imports.)
  2. Alphabetical.

use exec::{Exec, ExecBuilder};
use syntax;
@@ -186,6 +186,8 @@ pub struct ExNative {
#[doc(hidden)]
pub names: &'static &'static [Option<&'static str>],
#[doc(hidden)]
pub groups: &'static &'static [(&'static str, usize)],
#[doc(hidden)]
pub prog: fn(&mut CaptureIdxs, &str, usize) -> bool,
}

@@ -395,9 +397,13 @@ impl Regex {
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `at(0)` or `[0]`.
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
let mut caps = self.alloc_captures();
if exec(self, &mut caps, text, 0) {
Some(Captures::new(self, text, caps))
let mut locs = self.alloc_captures();
if exec(self, &mut locs, text, 0) {
Some(Captures {
text: text,
locs: locs,
named_groups: NamedGroups::from_regex(self)
})
} else {
None
}
@@ -804,6 +810,71 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
}
}

/// Lookup table from capture group name to capture group index.
///
/// A value of this type is stored in every `Captures`, so building one must
/// not allocate: each variant is either empty, a static slice, or a
/// reference-counted handle to a map owned by the regex.
enum NamedGroups {
    /// A dynamic regex whose name map is empty.
    Empty,
    /// The `(name, index)` table of an `ExNative` regex. Lookups binary
    /// search this slice, so it must be sorted by name.
    Native(&'static [(&'static str, usize)]),
    /// The name map of a dynamic regex, shared through an `Arc`.
    Dynamic(Arc<HashMap<String, usize>>),
}

impl NamedGroups {
    /// Builds the lookup table for `regex` without copying any names.
    fn from_regex(regex: &Regex) -> NamedGroups {
        match *regex {
            // `groups` is a reference to a static slice reference; peel one
            // layer off to get the slice reference itself.
            Regex::Native(ref native) => NamedGroups::Native(*native.groups),
            Regex::Dynamic(ref exec) => {
                let shared = exec.named_groups();
                if shared.is_empty() {
                    // Nothing to look up, so skip the reference-count bump.
                    return NamedGroups::Empty;
                }
                NamedGroups::Dynamic(Arc::clone(shared))
            }
        }
    }

    /// Returns the capture group index registered under `name`, or `None`
    /// if there is no group with that name.
    fn pos(&self, name: &str) -> Option<usize> {
        match *self {
            NamedGroups::Empty => None,
            NamedGroups::Native(table) => table
                .binary_search_by(|entry| entry.0.cmp(name))
                .ok()
                .map(|found| table[found].1),
            NamedGroups::Dynamic(ref map) => map.get(name).cloned(),
        }
    }

    /// Returns an iterator over every `(name, index)` pair.
    ///
    /// The native table is walked in slice order; the dynamic map is walked
    /// in `HashMap` iteration order.
    fn iter<'n>(&'n self) -> NamedGroupsIter<'n> {
        match *self {
            NamedGroups::Empty => NamedGroupsIter::Empty,
            NamedGroups::Native(table) => NamedGroupsIter::Native(table.iter()),
            NamedGroups::Dynamic(ref map) => NamedGroupsIter::Dynamic(map.iter()),
        }
    }
}

/// Iterator over the `(name, index)` pairs of a `NamedGroups` table.
///
/// `'n` is the lifetime of the borrow of the table being iterated.
enum NamedGroupsIter<'n> {
    /// There is nothing to yield.
    Empty,
    /// Walks a static slice of `(name, index)` pairs.
    Native(::std::slice::Iter<'static, (&'static str, usize)>),
    /// Walks the entries of a borrowed name map.
    Dynamic(::std::collections::hash_map::Iter<'n, String, usize>),
}

impl<'n> Iterator for NamedGroupsIter<'n> {
    type Item = (&'n str, usize);

    /// Yields the next `(name, index)` pair from whichever table backs
    /// this iterator.
    fn next(&mut self) -> Option<(&'n str, usize)> {
        match *self {
            NamedGroupsIter::Empty => None,
            // Pairs are `Copy`; hand them out by value.
            NamedGroupsIter::Native(ref mut pairs) => pairs.next().cloned(),
            NamedGroupsIter::Dynamic(ref mut entries) => match entries.next() {
                Some((name, index)) => Some((&name[..], *index)),
                None => None,
            },
        }
    }
}

/// Captures represents a group of captured strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
@@ -818,34 +889,10 @@ impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
pub struct Captures<'t> {
text: &'t str,
locs: Vec<Option<usize>>,
named: Option<HashMap<String, usize>>,
named_groups: NamedGroups,
}

impl<'t> Captures<'t> {
fn new(
re: &Regex,
search: &'t str,
locs: Vec<Option<usize>>,
) -> Captures<'t> {
let named =
if re.captures_len() == 0 {
None
} else {
let mut named = HashMap::new();
for (i, name) in re.capture_names().enumerate() {
if let Some(name) = name {
named.insert(name.to_owned(), i);
}
}
Some(named)
};
Captures {
text: search,
locs: locs,
named: named,
}
}

/// Returns the start and end positions of the Nth capture group.
/// Returns `None` if `i` is not a valid capture group or if the capture
/// group did not match anything.
@@ -874,15 +921,7 @@ impl<'t> Captures<'t> {
/// `name` isn't a valid capture group or didn't match anything, then
/// `None` is returned.
pub fn name(&self, name: &str) -> Option<&'t str> {
match self.named {
None => None,
Some(ref h) => {
match h.get(name) {
None => None,
Some(i) => self.at(*i),
}
}
}
self.named_groups.pos(name).and_then(|i| self.at(i))
}

/// Creates an iterator of all the capture groups in order of appearance
@@ -895,7 +934,7 @@ impl<'t> Captures<'t> {
/// appearance in the regular expression. Positions are byte indices
/// in terms of the original string matched.
pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
SubCapturesPos { idx: 0, caps: self, }
SubCapturesPos { idx: 0, locs: &self.locs }
}

/// Creates an iterator of all named groups as a tuple with the group
@@ -904,7 +943,7 @@ impl<'t> Captures<'t> {
pub fn iter_named(&'t self) -> SubCapturesNamed<'t> {
SubCapturesNamed {
caps: self,
inner: self.named.as_ref().map(|n| n.iter()),
names: self.named_groups.iter()
}
}

@@ -978,16 +1017,16 @@ impl<'t> Index<&'t str> for Captures<'t> {
/// An iterator over capture groups for a particular match of a regular
/// expression.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCaptures<'t> {
/// `'c` is the lifetime of the captures.
pub struct SubCaptures<'c> {
idx: usize,
caps: &'t Captures<'t>,
caps: &'c Captures<'c>,
}

impl<'t> Iterator for SubCaptures<'t> {
type Item = Option<&'t str>;
impl<'c> Iterator for SubCaptures<'c> {
type Item = Option<&'c str>;

fn next(&mut self) -> Option<Option<&'t str>> {
fn next(&mut self) -> Option<Option<&'c str>> {
if self.idx < self.caps.len() {
self.idx += 1;
Some(self.caps.at(self.idx - 1))
@@ -1002,42 +1041,43 @@ impl<'t> Iterator for SubCaptures<'t> {
///
/// Positions are byte indices in terms of the original string matched.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCapturesPos<'t> {
/// `'c` is the lifetime of the captures.
pub struct SubCapturesPos<'c> {
idx: usize,
caps: &'t Captures<'t>,
locs: &'c [Option<usize>]
}

impl<'t> Iterator for SubCapturesPos<'t> {
impl<'c> Iterator for SubCapturesPos<'c> {
type Item = Option<(usize, usize)>;

fn next(&mut self) -> Option<Option<(usize, usize)>> {
if self.idx < self.caps.len() {
self.idx += 1;
Some(self.caps.pos(self.idx - 1))
} else {
None
if self.idx >= self.locs.len() {
return None
}
let r = match (self.locs[self.idx], self.locs[self.idx + 1]) {
(Some(s), Some(e)) => Some((s, e)),
(None, None) => None,
_ => unreachable!()
};
self.idx += 2;
Some(r)
}
}

/// An Iterator over named capture groups as a tuple with the group
/// name and the value.
///
/// `'t` is the lifetime of the matched text.
pub struct SubCapturesNamed<'t>{
caps: &'t Captures<'t>,
inner: Option<Iter<'t, String, usize>>,
/// `'c` is the lifetime of the captures.
pub struct SubCapturesNamed<'c> {
caps: &'c Captures<'c>,
names: NamedGroupsIter<'c>,
}

impl<'t> Iterator for SubCapturesNamed<'t> {
type Item = (&'t str, Option<&'t str>);
impl<'c> Iterator for SubCapturesNamed<'c> {
type Item = (&'c str, Option<&'c str>);

fn next(&mut self) -> Option<(&'t str, Option<&'t str>)> {
match self.inner.as_mut().map_or(None, |it| it.next()) {
Some((name, pos)) => Some((name, self.caps.at(*pos))),
None => None
}
fn next(&mut self) -> Option<(&'c str, Option<&'c str>)> {
self.names.next().map(|(name, pos)| (name, self.caps.at(pos)))
}
}

@@ -1081,7 +1121,11 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> {
}
self.last_end = e;
self.last_match = Some(self.last_end);
Some(Captures::new(self.re, self.search, caps))
Some(Captures {
text: self.search,
locs: caps,
named_groups: NamedGroups::from_regex(self.re),
})
}
}