-
Notifications
You must be signed in to change notification settings - Fork 442
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Execbuilder::new() and ExecBuilder::new().nfa() disagree on .is_match() #321
Comments
ping. the fuzzer is running idle as it get clobbered by the above :-D No harm done though. |
When doing literal extraction, a non-empty concatenation should always be cut when a `^` (for prefixes) or a `$` (for suffixes) is seen. If a counted repetition is used, e.g., `${2}`, then the cut detection fails. We add in a special case to handle it. Fixes rust-lang#321
Maybe it's my goggles that do nothing but |
This also fails extern crate regex;
pub const HAYSTACK_BYTES: [u8; 262] = [10,219,130,110,51,1,221,51,205,70,12,201,191,23,226,175,74,207,138,124,182,166,176,65,174,198,202,158,2,225,16,59,179,254,110,226,184,239,17,223,254,67,212,147,166,109,45,144,214,233,247,218,110,148,102,109,112,84,46,161,70,151,35,221,154,172,175,60,186,160,44,165,38,160,131,178,147,197,230,9,227,110,242,93,196,25,23,88,150,240,202,57,72,161,79,33,88,155,46,20,156,209,80,17,6,122,255,159,42,138,97,148,42,1,134,254,175,154,118,137,252,119,88,132,250,255,7,94,102,111,111,98,97,114,192,69,24,44,7,187,217,77,117,131,107,37,205,153,231,243,188,64,147,183,87,202,87,34,221,49,172,14,222,197,206,157,196,176,100,113,151,181,106,89,224,194,54,164,240,210,162,157,107,21,239,126,84,168,76,73,117,99,18,144,107,103,7,136,44,210,32,17,9,133,177,94,171,172,164,253,159,236,46,60,118,74,81,246,72,57,131,136,252,22,30,96,136,28,172,240,252,51,203,154,45,147,114,190,238,10,87,185,242,57,30,54,151,233,121,58,127,159,24,132,25,194,65,211,134,29,236,76,];
fn main() {
let pattern = ".";
let haystack = &HAYSTACK_BYTES;
let re0 = regex::internal::ExecBuilder::new(pattern).build().map(|b| b.into_byte_regex()).unwrap();
let re1 = regex::internal::ExecBuilder::new(pattern).nfa().build().unwrap().into_byte_regex();
assert_eq!(re0.find_iter(haystack).collect::<Vec<_>>(),
re1.find_iter(haystack).collect::<Vec<_>>());
} |
This program completes successfully for me: extern crate regex;
use regex::Regex;
use regex::internal::ExecBuilder;
pub const HAYSTACK_BYTES: [u8; 262] = [212,194,101,43,14,254,200,102,172,158,178,63,73,128,238,90,186,218,203,107,17,42,187,194,114,122,39,42,202,120,78,115,51,8,109,230,114,178,50,40,233,196,250,19,230,246,130,68,88,49,4,87,73,253,96,140,121,187,133,37,174,185,69,126,100,39,234,88,147,125,51,142,131,153,103,107,47,27,92,0,36,42,79,245,225,15,205,183,108,44,19,164,215,140,128,87,12,217,253,133,5,251,120,78,224,130,209,100,78,138,101,62,121,226,131,123,19,12,130,77,118,202,115,187,243,168,18,46,102,111,111,98,97,114,89,240,245,241,239,215,176,34,71,253,166,145,32,162,6,23,106,198,192,105,182,19,183,228,153,80,140,62,171,120,184,101,142,165,165,162,123,222,119,97,28,72,30,246,250,12,210,74,180,118,38,99,44,211,198,82,31,131,198,120,65,12,31,76,15,18,137,80,242,118,195,176,244,106,138,176,15,67,203,132,213,138,65,3,242,223,20,137,37,214,96,32,72,109,132,75,28,112,190,249,19,243,98,136,224,12,182,154,248,111,245,31,108,144,25,209,196,192,225,207,145,190,24,2,216,33,127,138,];
fn main() {
let pattern = "${2}ä";
let haystack = &HAYSTACK_BYTES;
let re0 = ExecBuilder::new(pattern).build().unwrap().into_byte_regex();
let re1 = ExecBuilder::new(pattern).nfa().build().unwrap().into_byte_regex();
assert_eq!(re0.is_match(haystack), re1.is_match(haystack));
} But dammit, you're right, this one doesn't: extern crate regex;
use regex::Regex;
use regex::internal::ExecBuilder;
pub const HAYSTACK_BYTES: [u8; 262] = [212,194,101,43,14,254,200,102,172,158,178,63,73,128,238,90,186,218,203,107,17,42,187,194,114,122,39,42,202,120,78,115,51,8,109,230,114,178,50,40,233,196,250,19,230,246,130,68,88,49,4,87,73,253,96,140,121,187,133,37,174,185,69,126,100,39,234,88,147,125,51,142,131,153,103,107,47,27,92,0,36,42,79,245,225,15,205,183,108,44,19,164,215,140,128,87,12,217,253,133,5,251,120,78,224,130,209,100,78,138,101,62,121,226,131,123,19,12,130,77,118,202,115,187,243,168,18,46,102,111,111,98,97,114,89,240,245,241,239,215,176,34,71,253,166,145,32,162,6,23,106,198,192,105,182,19,183,228,153,80,140,62,171,120,184,101,142,165,165,162,123,222,119,97,28,72,30,246,250,12,210,74,180,118,38,99,44,211,198,82,31,131,198,120,65,12,31,76,15,18,137,80,242,118,195,176,244,106,138,176,15,67,203,132,213,138,65,3,242,223,20,137,37,214,96,32,72,109,132,75,28,112,190,249,19,243,98,136,224,12,182,154,248,111,245,31,108,144,25,209,196,192,225,207,145,190,24,2,216,33,127,138,];
fn main() {
let pattern = ".";
let haystack = &HAYSTACK_BYTES;
let re0 = ExecBuilder::new(pattern).build().unwrap().into_byte_regex();
let re1 = ExecBuilder::new(pattern).nfa().build().unwrap().into_byte_regex();
assert_eq!(re0.is_match(haystack), re1.is_match(haystack));
} |
Notice that the haystack is different for the |
@lukaslueg I think my first program in my previous comment matches your initial submission, no? |
It does, sorry, I changed it from under your nose. pub const HAYSTACK_BYTES: [u8; 262] = [10,219,130,110,51,1,221,51,205,70,12,201,191,23,226,175,74,207,138,124,182,166,176,65,174,198,202,158,2,225,16,59,179,254,110,226,184,239,17,223,254,67,212,147,166,109,45,144,214,233,247,218,110,148,102,109,112,84,46,161,70,151,35,221,154,172,175,60,186,160,44,165,38,160,131,178,147,197,230,9,227,110,242,93,196,25,23,88,150,240,202,57,72,161,79,33,88,155,46,20,156,209,80,17,6,122,255,159,42,138,97,148,42,1,134,254,175,154,118,137,252,119,88,132,250,255,7,94,102,111,111,98,97,114,192,69,24,44,7,187,217,77,117,131,107,37,205,153,231,243,188,64,147,183,87,202,87,34,221,49,172,14,222,197,206,157,196,176,100,113,151,181,106,89,224,194,54,164,240,210,162,157,107,21,239,126,84,168,76,73,117,99,18,144,107,103,7,136,44,210,32,17,9,133,177,94,171,172,164,253,159,236,46,60,118,74,81,246,72,57,131,136,252,22,30,96,136,28,172,240,252,51,203,154,45,147,114,190,238,10,87,185,242,57,30,54,151,233,121,58,127,159,24,132,25,194,65,211,134,29,236,76,]; with |
Neither of these programs fail for me on extern crate regex;
use regex::Regex;
use regex::internal::ExecBuilder;
pub const HAYSTACK_BYTES: &'static [u8] = &[10,219,130,110,51,1,221,51,205,70,12,201,191,23,226,175,74,207,138,124,182,166,176,65,174,198,202,158,2,225,16,59,179,254,110,226,184,239,17,223,254,67,212,147,166,109,45,144,214,233,247,218,110,148,102,109,112,84,46,161,70,151,35,221,154,172,175,60,186,160,44,165,38,160,131,178,147,197,230,9,227,110,242,93,196,25,23,88,150,240,202,57,72,161,79,33,88,155,46,20,156,209,80,17,6,122,255,159,42,138,97,148,42,1,134,254,175,154,118,137,252,119,88,132,250,255,7,94,102,111,111,98,97,114,192,69,24,44,7,187,217,77,117,131,107,37,205,153,231,243,188,64,147,183,87,202,87,34,221,49,172,14,222,197,206,157,196,176,100,113,151,181,106,89,224,194,54,164,240,210,162,157,107,21,239,126,84,168,76,73,117,99,18,144,107,103,7,136,44,210,32,17,9,133,177,94,171,172,164,253,159,236,46,60,118,74,81,246,72,57,131,136,252,22,30,96,136,28,172,240,252,51,203,154,45,147,114,190,238,10,87,185,242,57,30,54,151,233,121,58,127,159,24,132,25,194,65,211,134,29,236,76,];
fn main() {
let pattern = "${2}ä";
let haystack = HAYSTACK_BYTES;
let re0 = ExecBuilder::new(pattern).build().unwrap().into_byte_regex();
let re1 = ExecBuilder::new(pattern).nfa().build().unwrap().into_byte_regex();
assert_eq!(re0.is_match(haystack), re1.is_match(haystack));
} and extern crate regex;
use regex::Regex;
use regex::internal::ExecBuilder;
pub const HAYSTACK_BYTES: &'static [u8] = &[10,219,130,110,51,1,221,51,205,70,12,201,191,23,226,175,74,207,138,124,182,166,176,65,174,198,202,158,2,225,16,59,179,254,110,226,184,239,17,223,254,67,212,147,166,109,45,144,214,233,247,218,110,148,102,109,112,84,46,161,70,151,35,221,154,172,175,60,186,160,44,165,38,160,131,178,147,197,230,9,227,110,242,93,196,25,23,88,150,240,202,57,72,161,79,33,88,155,46,20,156,209,80,17,6,122,255,159,42,138,97,148,42,1,134,254,175,154,118,137,252,119,88,132,250,255,7,94,102,111,111,98,97,114,192,69,24,44,7,187,217,77,117,131,107,37,205,153,231,243,188,64,147,183,87,202,87,34,221,49,172,14,222,197,206,157,196,176,100,113,151,181,106,89,224,194,54,164,240,210,162,157,107,21,239,126,84,168,76,73,117,99,18,144,107,103,7,136,44,210,32,17,9,133,177,94,171,172,164,253,159,236,46,60,118,74,81,246,72,57,131,136,252,22,30,96,136,28,172,240,252,51,203,154,45,147,114,190,238,10,87,185,242,57,30,54,151,233,121,58,127,159,24,132,25,194,65,211,134,29,236,76,];
fn main() {
let pattern = ".";
let haystack = HAYSTACK_BYTES;
let re0 = ExecBuilder::new(pattern).build().unwrap().into_byte_regex();
let re1 = ExecBuilder::new(pattern).nfa().build().unwrap().into_byte_regex();
assert_eq!(re0.is_match(haystack), re1.is_match(haystack));
} |
did you assert_eq!(re0.find_iter(haystack).collect::<Vec<_>>(),
re1.find_iter(haystack).collect::<Vec<_>>()); ? |
It was possible for an invalid continuation byte to sneak through, which resulted in incorrect UTF-8 decoding results. Fixes rust-lang#321
@lukaslueg Indeed I did not. Hopefully all fixed now in #344. Thanks for your patience. :-) |
Fix a bug in UTF-8 decoding. It was possible for an invalid continuation byte to sneak through, which resulted in incorrect UTF-8 decoding results. Fixes #321
Blame my goggles again, but the following haystack and pattern pub const HAYSTACK_BYTES: [u8; 262] = [226,19,16,129,184,223,15,117,3,59,38,166,221,62,114,169,167,248,253,83,253,48,119,113,14,9,100,154,12,108,196,37,207,96,176,224,126,249,114,29,82,227,145,83,169,24,52,55,193,230,191,76,176,44,118,182,201,77,26,212,163,173,69,89,65,9,251,189,89,79,151,217,245,251,60,139,138,207,169,137,246,16,134,207,120,190,69,104,25,44,44,26,18,149,195,217,121,56,123,87,120,55,87,109,220,141,200,36,61,59,160,171,140,135,247,96,39,199,171,177,98,149,129,120,199,23,98,167,102,111,111,98,97,114,206,228,12,231,203,10,196,20,237,53,124,171,222,233,171,122,166,161,170,240,63,205,165,80,237,240,42,128,131,65,192,176,77,20,156,231,195,123,87,40,99,11,45,231,135,35,20,154,211,179,5,17,46,38,234,130,184,49,9,41,49,53,32,238,15,173,231,100,173,59,146,179,134,222,31,21,16,105,134,91,65,84,82,47,228,68,86,217,96,135,63,16,189,137,26,61,131,157,29,113,249,241,20,82,12,217,177,244,54,223,40,229,57,126,124,20,77,149,245,48,65,6,132,35,233,208,38,254,]; |
This program exits successfully on current master: extern crate regex;
use regex::internal::ExecBuilder;
pub const HAYSTACK_BYTES: [u8; 262] = [226,19,16,129,184,223,15,117,3,59,38,166,221,62,114,169,167,248,253,83,253,48,119,113,14,9,100,154,12,108,196,37,207,96,176,224,126,249,114,29,82,227,145,83,169,24,52,55,193,230,191,76,176,44,118,182,201,77,26,212,163,173,69,89,65,9,251,189,89,79,151,217,245,251,60,139,138,207,169,137,246,16,134,207,120,190,69,104,25,44,44,26,18,149,195,217,121,56,123,87,120,55,87,109,220,141,200,36,61,59,160,171,140,135,247,96,39,199,171,177,98,149,129,120,199,23,98,167,102,111,111,98,97,114,206,228,12,231,203,10,196,20,237,53,124,171,222,233,171,122,166,161,170,240,63,205,165,80,237,240,42,128,131,65,192,176,77,20,156,231,195,123,87,40,99,11,45,231,135,35,20,154,211,179,5,17,46,38,234,130,184,49,9,41,49,53,32,238,15,173,231,100,173,59,146,179,134,222,31,21,16,105,134,91,65,84,82,47,228,68,86,217,96,135,63,16,189,137,26,61,131,157,29,113,249,241,20,82,12,217,177,244,54,223,40,229,57,126,124,20,77,149,245,48,65,6,132,35,233,208,38,254,];
fn main() {
let pattern = ".";
let haystack = &HAYSTACK_BYTES;
let re0 = ExecBuilder::new(pattern).build().unwrap().into_byte_regex();
let re1 = ExecBuilder::new(pattern).nfa().build().unwrap().into_byte_regex();
assert_eq!(re0.is_match(haystack), re1.is_match(haystack));
assert_eq!(re0.find_iter(haystack).collect::<Vec<_>>(),
re1.find_iter(haystack).collect::<Vec<_>>());
} So does: extern crate regex;
use regex::internal::ExecBuilder;
pub const HAYSTACK_BYTES: [u8; 262] = [226,19,16,129,184,223,15,117,3,59,38,166,221,62,114,169,167,248,253,83,253,48,119,113,14,9,100,154,12,108,196,37,207,96,176,224,126,249,114,29,82,227,145,83,169,24,52,55,193,230,191,76,176,44,118,182,201,77,26,212,163,173,69,89,65,9,251,189,89,79,151,217,245,251,60,139,138,207,169,137,246,16,134,207,120,190,69,104,25,44,44,26,18,149,195,217,121,56,123,87,120,55,87,109,220,141,200,36,61,59,160,171,140,135,247,96,39,199,171,177,98,149,129,120,199,23,98,167,102,111,111,98,97,114,206,228,12,231,203,10,196,20,237,53,124,171,222,233,171,122,166,161,170,240,63,205,165,80,237,240,42,128,131,65,192,176,77,20,156,231,195,123,87,40,99,11,45,231,135,35,20,154,211,179,5,17,46,38,234,130,184,49,9,41,49,53,32,238,15,173,231,100,173,59,146,179,134,222,31,21,16,105,134,91,65,84,82,47,228,68,86,217,96,135,63,16,189,137,26,61,131,157,29,113,249,241,20,82,12,217,177,244,54,223,40,229,57,126,124,20,77,149,245,48,65,6,132,35,233,208,38,254,];
fn main() {
let pattern = "${2}ä";
let haystack = &HAYSTACK_BYTES;
let re0 = ExecBuilder::new(pattern).build().unwrap().into_byte_regex();
let re1 = ExecBuilder::new(pattern).nfa().build().unwrap().into_byte_regex();
assert_eq!(re0.is_match(haystack), re1.is_match(haystack));
assert_eq!(re0.find_iter(haystack).collect::<Vec<_>>(),
re1.find_iter(haystack).collect::<Vec<_>>());
} |
Turns out I didnt update the regex crate inside docker. Here is another one, pinned to 204e409. pub const HAYSTACK_BYTES: [u8; 262] = [111,180,136,164,123,86,126,154,252,75,56,84,139,176,104,161,162,133,242,129,92,87,113,21,115,187,36,62,133,143,236,49,58,59,39,223,254,78,136,51,71,173,204,51,26,128,236,61,69,32,83,213,177,62,25,108,104,210,162,220,220,66,61,107,20,217,110,203,191,151,6,25,73,149,48,159,38,143,245,66,139,105,164,244,170,14,159,225,75,214,161,218,79,18,85,14,224,173,4,57,89,167,130,88,24,141,52,159,177,173,196,230,187,6,113,95,123,176,89,112,79,235,58,61,165,91,202,8,102,111,111,98,97,114,175,149,0,83,42,81,134,22,55,50,252,101,40,123,2,149,4,255,191,195,81,83,191,11,3,120,251,61,121,247,42,2,118,134,221,248,78,254,71,80,31,254,79,125,220,54,173,72,182,120,23,52,240,107,65,194,215,139,10,87,18,168,37,121,218,65,163,173,20,179,237,62,232,120,210,14,118,20,18,100,55,160,92,144,23,26,215,17,168,157,147,92,38,78,34,199,79,157,129,86,38,139,12,115,215,54,86,40,103,212,211,182,31,156,225,92,241,240,191,146,155,8,208,65,121,47,80,138,];
let pattern = r"=\b|N"; |
@lukaslueg Blech. Word boundaries. Nice find. I shortened the pattern and the haystack a bit and filed a new bug: #345 Thanks! |
AFL found a regex pattern where the two engines differ on .is_match(). There are many more examples, I strongly suspect them to be of the same underlying cause, though.
Tested with git as of today. As far as I tested this, the assertion succeeds if the bogus repetition count is removed or set to 1 or if the 'ä' is turned into an ASCII-character.
The text was updated successfully, but these errors were encountered: