Skip to content

Commit

Permalink
feat: add negation syntax in hex strings
Browse files Browse the repository at this point in the history
  • Loading branch information
vthib committed Mar 5, 2023
1 parent 8bc0a77 commit 9c21fd4
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 98 deletions.
7 changes: 7 additions & 0 deletions boreal-parser/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ impl Error {
.with_message("alphabets used for base64 and base64wide must be identical")
.with_labels(vec![Label::primary((), self.span.clone())]),

ErrorKind::CannotNegateMaskAll => Diagnostic::error()
.with_message("negating an unknown byte is not allowed")
.with_labels(vec![Label::primary((), self.span.clone())]),

ErrorKind::ExprTooDeep => Diagnostic::error()
.with_message("too many imbricated expressions")
.with_labels(vec![Label::primary((), self.span.clone())]),
Expand Down Expand Up @@ -168,6 +172,9 @@ pub enum ErrorKind {
/// Alphabets used for base64 and base64wide for the same string are not identical.
Base64AlphabetIncompatible,

/// The '~??' syntax cannot be used in a hex string
CannotNegateMaskAll,

/// An expression contains too many nested expressions.
ExprTooDeep,

Expand Down
57 changes: 55 additions & 2 deletions boreal-parser/src/hex_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@ use super::types::{Input, ParseResult};
pub enum Token {
/// A fully declared byte, eg `9C`
Byte(u8),
/// Negation of a byte, eg `~9C`
NotByte(u8),
/// A masked byte, eg `?5`, `C?`, `??`
MaskedByte(u8, Mask),
/// Negation of a masked byte, eg `~?C`. The mask cannot be [`Mask::All`].
NotMaskedByte(u8, Mask),
/// A jump of unknown bytes, eg `[5-10]`, `[3-]`, ...
Jump(Jump),
/// Two possible lists of tokens, eg `( 12 34 | 98 76 )`
Expand Down Expand Up @@ -82,6 +86,26 @@ fn byte(input: Input) -> ParseResult<u8> {
map(rtrim(hex_digit), move |digit1| (digit0 << 4) | digit1)(input)
}

/// Parse the not tokens.
fn not_token(input: Input) -> ParseResult<Token> {
let start = input.pos();

let (input, _) = char('~')(input)?;
let (input, token) = cut(alt((
map(byte, Token::NotByte),
map(masked_byte, |(b, mask)| Token::NotMaskedByte(b, mask)),
)))(input)?;

if let Token::NotMaskedByte(_, Mask::All) = &token {
return Err(nom::Err::Failure(Error::new(
input.get_span_from(start),
ErrorKind::CannotNegateMaskAll,
)));
}

Ok((input, token))
}

/// Parse a masked hex byte, ie X?, ?X or ??.
///
/// Equivalent to the `_MASKED_BYTE_` lexical pattern in libyara.
Expand Down Expand Up @@ -230,9 +254,9 @@ fn validate_jump_in_alternatives(jump: &Jump) -> Result<(), ErrorKind> {
/// This is equivalent to the `token_or_range` rule in `hex_grammar.y` in libyara.
fn hex_token(input: Input, in_alternatives: bool) -> ParseResult<Token> {
alt((
map(masked_byte, |(v, mask)| Token::MaskedByte(v, mask)),
// Always have at least one space after a byte or a masked byte
not_token,
map(byte, Token::Byte),
map(masked_byte, |(v, mask)| Token::MaskedByte(v, mask)),
|input| range_as_hex_token(input, in_alternatives),
alternatives,
))(input)
Expand Down Expand Up @@ -314,6 +338,35 @@ mod tests {
parse_err(masked_byte, "?G");
}

/// Check `not_token` on valid negated bytes / masked bytes and on malformed inputs.
#[test]
fn test_parse_not_token() {
// Valid inputs: a negated byte, then a negated masked byte for each partial mask.
parse(not_token, "~23a", "a", Token::NotByte(0x23));
parse(
not_token,
"~?3b",
"b",
Token::NotMaskedByte(0x03, Mask::Left),
);
parse(
not_token,
"~F?",
"",
Token::NotMaskedByte(0x0F, Mask::Right),
);

// A tilde with a missing, incomplete or space-separated operand is an error.
parse_err(not_token, "~");
parse_err(not_token, "~1");
parse_err(not_token, "~1 2");
parse_err(not_token, "~ 12");
// Negating a fully masked byte is explicitly forbidden.
parse_err(not_token, "~??");
// Non-hexadecimal digits in the operand.
parse_err(not_token, "~g1");
parse_err(not_token, "~1g");
// Inputs without a leading tilde are not handled by this parser.
parse_err(not_token, "12");
parse_err(not_token, "?a");
parse_err(not_token, "a?");
parse_err(not_token, "??");
}

#[test]
fn test_range() {
parse(range, "[-] a", "a", Jump { from: 0, to: None });
Expand Down
47 changes: 30 additions & 17 deletions boreal/src/compiler/variable/hex_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,12 @@ pub(super) fn hex_string_to_ast(hex_string: Vec<Token>) -> Node {
fn hex_token_to_ast(token: Token) -> Node {
match token {
Token::Byte(b) => Node::Literal(b),
Token::MaskedByte(b, mask) => match mask {
Mask::Left => Node::Class(ClassKind::Bracketed(BracketedClass {
items: (0..=0xF)
.map(|i| BracketedClassItem::Literal((i << 4) + b))
.collect(),
negated: false,
})),
Mask::Right => {
let b = b << 4;
Node::Class(ClassKind::Bracketed(BracketedClass {
items: vec![BracketedClassItem::Range(b, b + 0x0F)],
negated: false,
}))
}
Mask::All => Node::Dot,
},
Token::NotByte(b) => Node::Class(ClassKind::Bracketed(BracketedClass {
items: vec![BracketedClassItem::Literal(b)],
negated: true,
})),
Token::MaskedByte(b, mask) => masked_byte_to_class(b, &mask, false),
Token::NotMaskedByte(b, mask) => masked_byte_to_class(b, &mask, true),
Token::Jump(jump) => {
let kind = match (jump.from, jump.to) {
(from, None) => RepetitionKind::Range(RepetitionRange::AtLeast(from)),
Expand All @@ -43,6 +33,25 @@ fn hex_token_to_ast(token: Token) -> Node {
}
}

fn masked_byte_to_class(byte: u8, mask: &Mask, negated: bool) -> Node {
match mask {
Mask::Left => Node::Class(ClassKind::Bracketed(BracketedClass {
items: (0..=0xF)
.map(|i| BracketedClassItem::Literal((i << 4) + byte))
.collect(),
negated,
})),
Mask::Right => {
let byte = byte << 4;
Node::Class(ClassKind::Bracketed(BracketedClass {
items: vec![BracketedClassItem::Range(byte, byte + 0x0F)],
negated,
}))
}
Mask::All => Node::Dot,
}
}

/// Can the hex string be expressed using only literals.
pub(super) fn can_use_only_literals(hex_string: &[Token]) -> bool {
let nb_literals = match count_total_literals(hex_string) {
Expand All @@ -53,20 +62,22 @@ pub(super) fn can_use_only_literals(hex_string: &[Token]) -> bool {
nb_literals < 100
}

/// Count the total of literals that would needed to exhaustively express the hex string.
/// Count the total number of literals that would be needed to exhaustively express the hex string.
fn count_total_literals(hex_string: &[Token]) -> Option<usize> {
let mut nb_lits = 1_usize;

for token in hex_string {
match token {
Token::Byte(_) => (),
Token::NotByte(_) => return None,
Token::Jump(_) => return None,
Token::MaskedByte(_, mask) => match mask {
Mask::Left | Mask::Right => {
nb_lits = nb_lits.checked_mul(16)?;
}
Mask::All => return None,
},
Token::NotMaskedByte(_, _) => return None,
Token::Alternatives(alts) => {
let mut nb_alts = 0_usize;
for alt in alts {
Expand All @@ -87,8 +98,10 @@ pub(super) fn hex_string_to_only_literals(hex_string: Vec<Token>) -> Vec<Vec<u8>
for token in hex_string {
match token {
Token::Byte(b) => literals.add_byte(b),
Token::NotByte(_) => unreachable!(),
Token::Jump(_) => unreachable!(),
Token::MaskedByte(b, mask) => literals.add_masked_byte(b, &mask),
Token::NotMaskedByte(_, _) => unreachable!(),
Token::Alternatives(alts) => literals.add_alternatives(alts),
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
rule a {
strings:
$a = { AB ~ ?? 0F }
condition:
$a
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
rule a {
strings:
$a = { AB ~?? 0F }
condition:
$a
}
156 changes: 77 additions & 79 deletions boreal/tests/it/libyara_compat/rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1679,70 +1679,69 @@ fn test_hex_strings() {
false,
);

// FIXME: to enable
// check(
// "rule test {
// strings: $a = { 31 32 ~32 34 35 }
// condition: $a }",
// concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
// true,
// );
check(
"rule test {
strings: $a = { 31 32 ~32 34 35 }
condition: $a }",
&join_str(TEXT_1024_BYTES, "1234567890"),
true,
);

// check(
// "rule test {
// strings: $a = { 31 32 ~33 34 35 }
// condition: $a }",
// concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
// false,
// );
check(
"rule test {
strings: $a = { 31 32 ~33 34 35 }
condition: $a }",
&join_str(TEXT_1024_BYTES, "1234567890"),
false,
);

// check(
// "rule test {
// strings: $a = { ( 31 32 ~32 34 35 | 31 32 ~33 34 35 ) }
// condition: $a }",
// concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
// true,
// );
check(
"rule test {
strings: $a = { ( 31 32 ~32 34 35 | 31 32 ~33 34 35 ) }
condition: $a }",
&join_str(TEXT_1024_BYTES, "1234567890"),
true,
);

// check(
// "rule test {
// strings: $a = { 31 32 ~?2 34 35 }
// condition: $a }",
// concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
// true,
// );
check(
"rule test {
strings: $a = { 31 32 ~?2 34 35 }
condition: $a }",
&join_str(TEXT_1024_BYTES, "1234567890"),
true,
);

// check(
// "rule test {
// strings: $a = { 31 32 ~?3 34 35 }
// condition: $a }",
// concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
// false,
// );
check(
"rule test {
strings: $a = { 31 32 ~?3 34 35 }
condition: $a }",
&join_str(TEXT_1024_BYTES, "1234567890"),
false,
);

// check(
// "rule test {
// strings: $a = { 31 32 ~4? 34 35 }
// condition: $a }",
// concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
// true,
// );
check(
"rule test {
strings: $a = { 31 32 ~4? 34 35 }
condition: $a }",
&join_str(TEXT_1024_BYTES, "1234567890"),
true,
);

// check(
// "rule test {
// strings: $a = { 31 32 ~3? 34 35 }
// condition: $a }",
// concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
// false,
// );
check(
"rule test {
strings: $a = { 31 32 ~3? 34 35 }
condition: $a }",
&join_str(TEXT_1024_BYTES, "1234567890"),
false,
);

// check(
// "rule test {
// strings: $a = { ( 31 32 ~3? 34 35 | 31 32 ~?2 34 35 ) }
// condition: $a }",
// concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
// true,
// );
check(
"rule test {
strings: $a = { ( 31 32 ~3? 34 35 | 31 32 ~?2 34 35 ) }
condition: $a }",
&join_str(TEXT_1024_BYTES, "1234567890"),
true,
);

check(
"rule test {
Expand Down Expand Up @@ -1895,30 +1894,29 @@ fn test_hex_strings() {
"mem:2:40: error: unbounded jumps not allowed inside alternations (|)",
);

// FIXME: to enable
// // ERROR_INVALID_HEX_STRING
// check_err(
// "rule test {
// strings: $a = { 01 02 ~ }
// condition: $a ",
// "z",
// );
// ERROR_INVALID_HEX_STRING
check_err(
"rule test {
strings: $a = { 01 02 ~ }
condition: $a ",
"mem:2:32: error: syntax error",
);

// // ERROR_INVALID_HEX_STRING
// check_err(
// "rule test {
// strings: $a = { 01 ~0 11 }
// condition: $a ",
// "z",
// );
// ERROR_INVALID_HEX_STRING
check_err(
"rule test {
strings: $a = { 01 ~0 11 }
condition: $a ",
"mem:2:30: error: syntax error",
);

// // ERROR_INVALID_HEX_STRING
// check_err(
// "rule test {
// strings: $a = { 01 ~?? 11 }
// condition: $a ",
// "z",
// );
// ERROR_INVALID_HEX_STRING
check_err(
"rule test {
strings: $a = { 01 ~?? 11 }
condition: $a ",
"mem:2:28: error: negating an unknown byte is not allowed",
);
}

#[test]
Expand Down
Loading

0 comments on commit 9c21fd4

Please sign in to comment.