feat: add negation syntax in hex strings

vthib · Mar 5, 2023 · 9c21fd4 · 9c21fd4
1 parent 8bc0a77
commit 9c21fd4
Show file tree

Hide file tree

Showing 7 changed files with 258 additions and 98 deletions.
diff --git a/boreal-parser/src/error.rs b/boreal-parser/src/error.rs
@@ -40,6 +40,10 @@ impl Error {
                 .with_message("alphabets used for base64 and base64wide must be identical")
                 .with_labels(vec![Label::primary((), self.span.clone())]),
 
+            ErrorKind::CannotNegateMaskAll => Diagnostic::error()
+                .with_message("negating an unknown byte is not allowed")
+                .with_labels(vec![Label::primary((), self.span.clone())]),
+
             ErrorKind::ExprTooDeep => Diagnostic::error()
                 .with_message("too many imbricated expressions")
                 .with_labels(vec![Label::primary((), self.span.clone())]),
@@ -168,6 +172,9 @@ pub enum ErrorKind {
     /// Alphabets used for base64 and base64wide for the same string are not identical.
     Base64AlphabetIncompatible,
 
+    /// A fully masked byte cannot be negated: the `~??` syntax is not allowed in a hex string.
+    CannotNegateMaskAll,
+
     /// An expression contains too many imbricated expressions.
     ExprTooDeep,
 

diff --git a/boreal-parser/src/hex_string.rs b/boreal-parser/src/hex_string.rs
@@ -18,8 +18,12 @@ use super::types::{Input, ParseResult};
 pub enum Token {
     /// A fully declared byte, eg `9C`
     Byte(u8),
+    /// Negation of a byte, eg `~9C`
+    NotByte(u8),
     /// A masked byte, eg `?5`, `C?`, `??`
     MaskedByte(u8, Mask),
+    /// Negation of a masked byte, eg `~?C`. The mask cannot be [`Mask::All`].
+    NotMaskedByte(u8, Mask),
     /// A jump of unknown bytes, eg `[5-10]`, `[3-]`, ...
     Jump(Jump),
     /// Two possible list of tokens, eg `( 12 34 | 98 76 )`
@@ -82,6 +86,26 @@ fn byte(input: Input) -> ParseResult<u8> {
     map(rtrim(hex_digit), move |digit1| (digit0 << 4) | digit1)(input)
 }
 
+/// Parse a negated token: `~` followed by a byte (`~XX`) or a partially masked byte (`~X?`, `~?X`).
+fn not_token(input: Input) -> ParseResult<Token> {
+    let start = input.pos(); // remembered so the error span below starts at the `~`
+
+    let (input, _) = char('~')(input)?;
+    let (input, token) = cut(alt(( // NOTE(review): assuming nom's `cut`, a failure after `~` is no longer recoverable by an enclosing `alt`
+        map(byte, Token::NotByte),
+        map(masked_byte, |(b, mask)| Token::NotMaskedByte(b, mask)),
+    )))(input)?;
+
+    if let Token::NotMaskedByte(_, Mask::All) = &token { // `~??`: negating a fully unknown byte is rejected
+        return Err(nom::Err::Failure(Error::new(
+            input.get_span_from(start),
+            ErrorKind::CannotNegateMaskAll,
+        )));
+    }
+
+    Ok((input, token))
+}
+
 /// Parse a masked hex byte, ie X?, ?X or ??.
 ///
 /// Equivalent to the `_MASKED_BYTE_` lexical pattern in libyara.
@@ -230,9 +254,9 @@ fn validate_jump_in_alternatives(jump: &Jump) -> Result<(), ErrorKind> {
 /// This is equivalent to the `token_or_range` rule in `hex_grammar.y` in libyara.
 fn hex_token(input: Input, in_alternatives: bool) -> ParseResult<Token> {
     alt((
-        map(masked_byte, |(v, mask)| Token::MaskedByte(v, mask)),
-        // Always have at least one space after a byte or a masked byte
+        not_token,
         map(byte, Token::Byte),
+        map(masked_byte, |(v, mask)| Token::MaskedByte(v, mask)),
         |input| range_as_hex_token(input, in_alternatives),
         alternatives,
     ))(input)
@@ -314,6 +338,35 @@ mod tests {
         parse_err(masked_byte, "?G");
     }
 
+    #[test]
+    fn test_parse_not_token() {
+        parse(not_token, "~23a", "a", Token::NotByte(0x23));
+        parse(
+            not_token,
+            "~?3b",
+            "b",
+            Token::NotMaskedByte(0x03, Mask::Left),
+        );
+        parse(
+            not_token,
+            "~F?",
+            "",
+            Token::NotMaskedByte(0x0F, Mask::Right),
+        );
+
+        parse_err(not_token, "~"); // nothing after the `~`
+        parse_err(not_token, "~1"); // a single hex digit is not a byte
+        parse_err(not_token, "~1 2"); // the two digits of a byte are separated by a space
+        parse_err(not_token, "~ 12"); // space between the `~` and the byte
+        parse_err(not_token, "~??"); // negation of a fully masked byte
+        parse_err(not_token, "~g1"); // first character is not a hex digit
+        parse_err(not_token, "~1g"); // second character is not a hex digit
+        parse_err(not_token, "12"); // plain byte, no leading `~`
+        parse_err(not_token, "?a"); // masked byte, no leading `~`
+        parse_err(not_token, "a?"); // masked byte, no leading `~`
+        parse_err(not_token, "??"); // fully masked byte, no leading `~`
+    }
+
     #[test]
     fn test_range() {
         parse(range, "[-] a", "a", Jump { from: 0, to: None });

diff --git a/boreal/src/compiler/variable/hex_string.rs b/boreal/src/compiler/variable/hex_string.rs
@@ -10,22 +10,12 @@ pub(super) fn hex_string_to_ast(hex_string: Vec<Token>) -> Node {
 fn hex_token_to_ast(token: Token) -> Node {
     match token {
         Token::Byte(b) => Node::Literal(b),
-        Token::MaskedByte(b, mask) => match mask {
-            Mask::Left => Node::Class(ClassKind::Bracketed(BracketedClass {
-                items: (0..=0xF)
-                    .map(|i| BracketedClassItem::Literal((i << 4) + b))
-                    .collect(),
-                negated: false,
-            })),
-            Mask::Right => {
-                let b = b << 4;
-                Node::Class(ClassKind::Bracketed(BracketedClass {
-                    items: vec![BracketedClassItem::Range(b, b + 0x0F)],
-                    negated: false,
-                }))
-            }
-            Mask::All => Node::Dot,
-        },
+        Token::NotByte(b) => Node::Class(ClassKind::Bracketed(BracketedClass {
+            items: vec![BracketedClassItem::Literal(b)],
+            negated: true,
+        })),
+        Token::MaskedByte(b, mask) => masked_byte_to_class(b, &mask, false),
+        Token::NotMaskedByte(b, mask) => masked_byte_to_class(b, &mask, true),
         Token::Jump(jump) => {
             let kind = match (jump.from, jump.to) {
                 (from, None) => RepetitionKind::Range(RepetitionRange::AtLeast(from)),
@@ -43,6 +33,25 @@ fn hex_token_to_ast(token: Token) -> Node {
     }
 }
 
+fn masked_byte_to_class(byte: u8, mask: &Mask, negated: bool) -> Node { // build the AST node for a masked byte; `negated` is forwarded to the bracketed class
+    match mask {
+        Mask::Left => Node::Class(ClassKind::Bracketed(BracketedClass { // high nibble masked: the 16 bytes whose low nibble is `byte`
+            items: (0..=0xF)
+                .map(|i| BracketedClassItem::Literal((i << 4) + byte))
+                .collect(),
+            negated,
+        })),
+        Mask::Right => { // low nibble masked: one contiguous range of 16 bytes
+            let byte = byte << 4; // move the known nibble into the high position
+            Node::Class(ClassKind::Bracketed(BracketedClass {
+                items: vec![BracketedClassItem::Range(byte, byte + 0x0F)],
+                negated,
+            }))
+        }
+        Mask::All => Node::Dot, // NOTE(review): `negated` is ignored here; the parser rejects `~??`, so presumably never reached with `negated == true` — confirm
+    }
+}
+
 /// Can the hex string be expressed using only literals.
 pub(super) fn can_use_only_literals(hex_string: &[Token]) -> bool {
     let nb_literals = match count_total_literals(hex_string) {
@@ -53,20 +62,22 @@ pub(super) fn can_use_only_literals(hex_string: &[Token]) -> bool {
     nb_literals < 100
 }
 
-/// Count the total of literals that would needed to exhaustively express the hex string.
+/// Count the total number of literals that would be needed to exhaustively express the hex string.
 fn count_total_literals(hex_string: &[Token]) -> Option<usize> {
     let mut nb_lits = 1_usize;
 
     for token in hex_string {
         match token {
             Token::Byte(_) => (),
+            Token::NotByte(_) => return None,
             Token::Jump(_) => return None,
             Token::MaskedByte(_, mask) => match mask {
                 Mask::Left | Mask::Right => {
                     nb_lits = nb_lits.checked_mul(16)?;
                 }
                 Mask::All => return None,
             },
+            Token::NotMaskedByte(_, _) => return None,
             Token::Alternatives(alts) => {
                 let mut nb_alts = 0_usize;
                 for alt in alts {
@@ -87,8 +98,10 @@ pub(super) fn hex_string_to_only_literals(hex_string: Vec<Token>) -> Vec<Vec<u8>
     for token in hex_string {
         match token {
             Token::Byte(b) => literals.add_byte(b),
+            Token::NotByte(_) => unreachable!(),
             Token::Jump(_) => unreachable!(),
             Token::MaskedByte(b, mask) => literals.add_masked_byte(b, &mask),
+            Token::NotMaskedByte(_, _) => unreachable!(),
             Token::Alternatives(alts) => literals.add_alternatives(alts),
         }
     }

diff --git a/boreal/tests/assets/invalid_files/parsing/hex_string_invalid_not.yar b/boreal/tests/assets/invalid_files/parsing/hex_string_invalid_not.yar
@@ -0,0 +1,6 @@
+rule a {
+    strings:
+        $a = { AB ~ ?? 0F }
+    condition:
+        $a
+}
diff --git a/boreal/tests/assets/invalid_files/parsing/hex_string_not_mask_all.yar b/boreal/tests/assets/invalid_files/parsing/hex_string_not_mask_all.yar
@@ -0,0 +1,6 @@
+rule a {
+    strings:
+        $a = { AB ~?? 0F }
+    condition:
+        $a
+}
diff --git a/boreal/tests/it/libyara_compat/rules.rs b/boreal/tests/it/libyara_compat/rules.rs
@@ -1679,70 +1679,69 @@ fn test_hex_strings() {
         false,
     );
 
-    // FIXME: to enable
-    // check(
-    //     "rule test {
-    //     strings: $a = { 31 32 ~32 34 35 }
-    //     condition: $a }",
-    //     concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
-    //     true,
-    // );
+    check(
+        "rule test {
+        strings: $a = { 31 32 ~32 34 35 }
+        condition: $a }",
+        &join_str(TEXT_1024_BYTES, "1234567890"),
+        true,
+    );
 
-    // check(
-    //     "rule test {
-    //     strings: $a = { 31 32 ~33 34 35 }
-    //     condition: $a }",
-    //     concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
-    //     false,
-    // );
+    check(
+        "rule test {
+        strings: $a = { 31 32 ~33 34 35 }
+        condition: $a }",
+        &join_str(TEXT_1024_BYTES, "1234567890"),
+        false,
+    );
 
-    // check(
-    //     "rule test {
-    //     strings: $a = { ( 31 32 ~32 34 35 | 31 32 ~33 34 35 ) }
-    //     condition: $a }",
-    //     concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
-    //     true,
-    // );
+    check(
+        "rule test {
+        strings: $a = { ( 31 32 ~32 34 35 | 31 32 ~33 34 35 ) }
+        condition: $a }",
+        &join_str(TEXT_1024_BYTES, "1234567890"),
+        true,
+    );
 
-    // check(
-    //     "rule test {
-    //     strings: $a = { 31 32 ~?2 34 35 }
-    //     condition: $a }",
-    //     concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
-    //     true,
-    // );
+    check(
+        "rule test {
+        strings: $a = { 31 32 ~?2 34 35 }
+        condition: $a }",
+        &join_str(TEXT_1024_BYTES, "1234567890"),
+        true,
+    );
 
-    // check(
-    //     "rule test {
-    //     strings: $a = { 31 32 ~?3 34 35 }
-    //     condition: $a }",
-    //     concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
-    //     false,
-    // );
+    check(
+        "rule test {
+        strings: $a = { 31 32 ~?3 34 35 }
+        condition: $a }",
+        &join_str(TEXT_1024_BYTES, "1234567890"),
+        false,
+    );
 
-    // check(
-    //     "rule test {
-    //     strings: $a = { 31 32 ~4? 34 35 }
-    //     condition: $a }",
-    //     concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
-    //     true,
-    // );
+    check(
+        "rule test {
+        strings: $a = { 31 32 ~4? 34 35 }
+        condition: $a }",
+        &join_str(TEXT_1024_BYTES, "1234567890"),
+        true,
+    );
 
-    // check(
-    //     "rule test {
-    //     strings: $a = { 31 32 ~3? 34 35 }
-    //     condition: $a }",
-    //     concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
-    //     false,
-    // );
+    check(
+        "rule test {
+        strings: $a = { 31 32 ~3? 34 35 }
+        condition: $a }",
+        &join_str(TEXT_1024_BYTES, "1234567890"),
+        false,
+    );
 
-    // check(
-    //     "rule test {
-    //     strings: $a = { ( 31 32 ~3? 34 35 | 31 32 ~?2 34 35 ) }
-    //     condition: $a }",
-    //     concatcp!(TEXT_1024_BYTES, "1234567890").as_bytes(),
-    //     true,
-    // );
+    check(
+        "rule test {
+        strings: $a = { ( 31 32 ~3? 34 35 | 31 32 ~?2 34 35 ) }
+        condition: $a }",
+        &join_str(TEXT_1024_BYTES, "1234567890"),
+        true,
+    );
 
     check(
         "rule test {
@@ -1895,30 +1894,29 @@ fn test_hex_strings() {
         "mem:2:40: error: unbounded jumps not allowed inside alternations (|)",
     );
 
-    // FIXME: to enable
-    // // ERROR_INVALID_HEX_STRING
-    // check_err(
-    //     "rule test {
-    //     strings: $a = { 01 02 ~ }
-    //     condition: $a ",
-    //     "z",
-    // );
+    // ERROR_INVALID_HEX_STRING
+    check_err(
+        "rule test {
+        strings: $a = { 01 02 ~ }
+        condition: $a ",
+        "mem:2:32: error: syntax error",
+    );
 
-    // // ERROR_INVALID_HEX_STRING
-    // check_err(
-    //     "rule test {
-    //     strings: $a = { 01 ~0 11 }
-    //     condition: $a ",
-    //     "z",
-    // );
+    // ERROR_INVALID_HEX_STRING
+    check_err(
+        "rule test {
+        strings: $a = { 01 ~0 11 }
+        condition: $a ",
+        "mem:2:30: error: syntax error",
+    );
 
-    // // ERROR_INVALID_HEX_STRING
-    // check_err(
-    //     "rule test {
-    //     strings: $a = { 01 ~?? 11 }
-    //     condition: $a ",
-    //     "z",
-    // );
+    // ERROR_INVALID_HEX_STRING
+    check_err(
+        "rule test {
+        strings: $a = { 01 ~?? 11 }
+        condition: $a ",
+        "mem:2:28: error: negating an unknown byte is not allowed",
+    );
 }
 
 #[test]