Skip to content

Commit

Permalink
feat: generalize the relaxed_escape_sequences option to `relaxed_re…
Browse files Browse the repository at this point in the history
…_syntax`

With this change, the new option not only controls whether invalid escape sequences are accepted; it also relaxes other cases in which YARA-X is stricter than YARA when parsing regular expressions. This is the case with characters that have a special meaning in a regular expression, but that YARA treats as literals when they appear in a context where this special meaning doesn't make sense.

For instance, YARA interprets the curly braces in `/foo{}bar/` as literal characters, while in `/foo{1,2}bar/` they are interpreted as part of the repetition operator `{1,2}`.  YARA-X doesn't accept `/foo{}bar/` as a valid regular expression, unless you enable the relaxed_re_syntax option.
  • Loading branch information
plusvic committed May 15, 2024
1 parent c639417 commit 14b4efe
Show file tree
Hide file tree
Showing 19 changed files with 240 additions and 176 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 12 additions & 14 deletions capi/include/yara_x.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,20 @@
// messages.
#define YRX_COLORIZE_ERRORS 1

// Flag passed to [`yrx_compiler_create`] for accepting invalid escape
// sequences in regular expressions.
// Flag passed to [`yrx_compiler_create`] that enables a more relaxed
// syntax check for regular expressions.
//
// Historically, YARA has accepted any character preceded by a backslash
// in a regular expression, regardless of whether the sequence is valid.
// For example, `\n`, `\t` and `\w` are valid escape sequences in a
// regexp, but `\N`, `\T` and `\j` are not. However, YARA accepts all of
// these sequences. Valid escape sequences are interpreted according to
// their special meaning (`\n` as a new-line, `\w` as a word character,
// etc.), while invalid escape sequences are interpreted simply as the
// character that appears after the backslash. Thus, `\N` becomes `N`,
// and `\j` becomes `j`.
// YARA-X enforces stricter regular expression syntax compared to YARA.
// For instance, YARA accepts invalid escape sequences and treats them
// as literal characters (e.g., \R is interpreted as a literal 'R'). It
// also allows some special characters to appear unescaped, inferring
// their meaning from the context (e.g., `{` and `}` in `/foo{}bar/` are
// literal, but in `/foo{0,1}bar/` they form the repetition operator
// `{0,1}`).
//
// When this flag is enabled, the YARA-X compiler exhibits the legacy
// behaviour and accepts invalid escape sequences.
#define YRX_RELAXED_RE_ESCAPE_SEQUENCES 2
// When this flag is set, YARA-X mimics YARA's behavior, allowing
// constructs that YARA-X doesn't accept by default.
#define YRX_RELAXED_RE_SYNTAX 2

typedef enum YRX_RESULT {
// Everything was OK.
Expand Down
30 changes: 14 additions & 16 deletions capi/src/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,25 @@ pub struct YRX_COMPILER<'a> {
/// messages.
pub const YRX_COLORIZE_ERRORS: u32 = 1;

/// Flag passed to [`yrx_compiler_create`] for accepting invalid escape
/// sequences in regular expressions.
/// Flag passed to [`yrx_compiler_create`] that enables a more relaxed
/// syntax check for regular expressions.
///
/// Historically, YARA has accepted any character preceded by a backslash
/// in a regular expression, regardless of whether the sequence is valid.
/// For example, `\n`, `\t` and `\w` are valid escape sequences in a
/// regexp, but `\N`, `\T` and `\j` are not. However, YARA accepts all of
/// these sequences. Valid escape sequences are interpreted according to
/// their special meaning (`\n` as a new-line, `\w` as a word character,
/// etc.), while invalid escape sequences are interpreted simply as the
/// character that appears after the backslash. Thus, `\N` becomes `N`,
/// and `\j` becomes `j`.
/// YARA-X enforces stricter regular expression syntax compared to YARA.
/// For instance, YARA accepts invalid escape sequences and treats them
/// as literal characters (e.g., \R is interpreted as a literal 'R'). It
/// also allows some special characters to appear unescaped, inferring
/// their meaning from the context (e.g., `{` and `}` in `/foo{}bar/` are
/// literal, but in `/foo{0,1}bar/` they form the repetition operator
/// `{0,1}`).
///
/// When this flag is enabled, the YARA-X compiler exhibits the legacy
/// behaviour and accepts invalid escape sequences.
pub const YRX_RELAXED_RE_ESCAPE_SEQUENCES: u32 = 2;
/// When this flag is set, YARA-X mimics YARA's behavior, allowing
/// constructs that YARA-X doesn't accept by default.
pub const YRX_RELAXED_RE_SYNTAX: u32 = 2;

fn _yrx_compiler_create<'a>(flags: u32) -> yara_x::Compiler<'a> {
let mut compiler = yara_x::Compiler::new();
if flags & YRX_RELAXED_RE_ESCAPE_SEQUENCES != 0 {
compiler.relaxed_re_escape_sequences(true);
if flags & YRX_RELAXED_RE_SYNTAX != 0 {
compiler.relaxed_re_syntax(true);
}
if flags & YRX_COLORIZE_ERRORS != 0 {
compiler.colorize_errors(true);
Expand Down
6 changes: 3 additions & 3 deletions cli/src/commands/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ pub fn compile() -> Command {
.help("Use file path as rule namespace"),
)
.arg(
arg!(--"relaxed-escape-sequences")
.help("Allow invalid escape sequences in regular expressions"),
arg!(--"relaxed-re-syntax")
.help("Use a more relaxed syntax check while parsing regular expressions"),
)
.arg(
Arg::new("define")
Expand Down Expand Up @@ -55,7 +55,7 @@ pub fn exec_compile(args: &ArgMatches) -> anyhow::Result<()> {
rules_path,
path_as_namespace,
external_vars,
args.get_flag("relaxed-escape-sequences"),
args.get_flag("relaxed-re-syntax"),
)?;

let output_file = File::create(output_path).with_context(|| {
Expand Down
4 changes: 2 additions & 2 deletions cli/src/commands/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,15 @@ pub fn compile_rules<'a, P>(
paths: P,
path_as_namespace: bool,
external_vars: Option<Vec<(String, Value)>>,
relaxed_re_escape_sequences: bool,
relaxed_re_syntax: bool,
) -> Result<Rules, anyhow::Error>
where
P: Iterator<Item = &'a PathBuf>,
{
let mut compiler: Compiler<'_> = Compiler::new();

compiler
.relaxed_re_escape_sequences(relaxed_re_escape_sequences)
.relaxed_re_syntax(relaxed_re_syntax)
.colorize_errors(stdout().is_tty());

if let Some(vars) = external_vars {
Expand Down
10 changes: 5 additions & 5 deletions cli/src/commands/scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ pub fn scan() -> Command {
.value_parser(value_parser!(u64).range(1..))
)
.arg(
arg!(--"relaxed-escape-sequences")
.help("Allow invalid escape sequences in regular expressions")
arg!(--"relaxed-re-syntax")
.help("Use a more relaxed syntax check while parsing regular expressions")
)
.arg(
arg!(-d --"define")
Expand Down Expand Up @@ -123,10 +123,10 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> {
);
}

if args.get_flag("relaxed-escape-sequences") {
if args.get_flag("relaxed-re-syntax") {
bail!(
"can't use '{}' together with '{}'",
Paint::bold("--relaxed-escape-sequences"),
Paint::bold("--relaxed-re-syntax"),
Paint::bold("--compiled-rules")
);
}
Expand Down Expand Up @@ -157,7 +157,7 @@ pub fn exec_scan(args: &ArgMatches) -> anyhow::Result<()> {
rules_path,
path_as_namespace,
external_vars.take(),
args.get_flag("relaxed-escape-sequences"),
args.get_flag("relaxed-re-syntax"),
)?
};

Expand Down
38 changes: 19 additions & 19 deletions go/compiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,32 +50,32 @@ func IgnoreModule(module string) CompileOption {
}
}

// RelaxedReEscapeSequences is an option for [NewCompiler] and [Compile] that
// determines whether invalid escape sequences in regular expressions should be
// accepted.
// RelaxedReSyntax is an option for [NewCompiler] and [Compile] that
// determines whether the compiler should adopt a more relaxed approach
// while parsing regular expressions.
//
// Historically, YARA has accepted any character preceded by a backslash in a
// regular expression, regardless of whether the sequence is valid. For example,
// `\n`, `\t` and `\w` are valid escape sequences in a regexp, but `\N`, `\T`
// and `\j` are not. However, YARA accepts all of these sequences. Valid escape
// sequences are interpreted according to their special meaning (`\n` as a
// new-line, `\w` as a word character, etc.), while invalid escape sequences are
// interpreted simply as the character that appears after the backslash. Thus,
// `\N` becomes `N`, and `\j` becomes `j`.
// YARA-X enforces stricter regular expression syntax compared to YARA.
// For instance, YARA accepts invalid escape sequences and treats them
// as literal characters (e.g., \R is interpreted as a literal 'R'). It
// also allows some special characters to appear unescaped, inferring
// their meaning from the context (e.g., `{` and `}` in `/foo{}bar/` are
// literal, but in `/foo{0,1}bar/` they form the repetition operator
// `{0,1}`).
//
// This option is disabled by default.
func RelaxedReEscapeSequences(yes bool) CompileOption {
// When this option is set, YARA-X mimics YARA's behavior, allowing
// constructs that YARA-X doesn't accept by default.
func RelaxedReSyntax(yes bool) CompileOption {
return func(c *Compiler) error {
c.relaxedReEscapeSequences = yes
c.relaxedReSyntax = yes
return nil
}
}

// Compiler represents a YARA compiler.
type Compiler struct {
cCompiler *C.YRX_COMPILER
relaxedReEscapeSequences bool
ignoredModules map[string]bool
cCompiler *C.YRX_COMPILER
relaxedReSyntax bool
ignoredModules map[string]bool
vars map[string]interface{}
}

Expand All @@ -93,8 +93,8 @@ func NewCompiler(opts... CompileOption) (*Compiler, error) {
}

flags := C.uint32_t(0)
if c.relaxedReEscapeSequences {
flags |= C.YRX_RELAXED_RE_ESCAPE_SEQUENCES
if c.relaxedReSyntax {
flags |= C.YRX_RELAXED_RE_SYNTAX
}

C.yrx_compiler_create(flags, &c.cCompiler)
Expand Down
4 changes: 2 additions & 2 deletions go/compiler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ func TestUnsupportedModules(t *testing.T) {
assert.Len(t, matchingRules, 1)
}

func TestRelaxedReEscapeSequences(t *testing.T) {
func TestRelaxedReSyntax(t *testing.T) {
r, err := Compile(`
rule test { strings: $a = /\Release/ condition: $a }`,
RelaxedReEscapeSequences(true))
RelaxedReSyntax(true))
assert.NoError(t, err)
matchingRules, _ := r.Scan([]byte("Release"))
assert.Len(t, matchingRules, 1)
Expand Down
2 changes: 1 addition & 1 deletion lib/src/compiler/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub(in crate::compiler) struct CompileContext<'a, 'src, 'sym> {
pub vars: VarStack,

/// Use a more relaxed syntax check for regular expressions, accepting
/// constructs (like invalid escape sequences) that are rejected by default.
pub relaxed_re_escape_sequences: bool,
pub relaxed_re_syntax: bool,
}

impl<'a, 'src, 'sym> CompileContext<'a, 'src, 'sym> {
Expand Down
4 changes: 2 additions & 2 deletions lib/src/compiler/ir/ast2ir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ pub(in crate::compiler) fn regexp_pattern_from_ast<'src>(
let hir = re::parser::Parser::new()
.force_case_insensitive(flags.contains(PatternFlags::Nocase))
.allow_mixed_greediness(false)
.relaxed_escape_sequences(ctx.relaxed_re_escape_sequences)
.relaxed_re_syntax(ctx.relaxed_re_syntax)
.parse(&pattern.regexp)
.map_err(|err| {
re_error_to_compile_error(ctx.report_builder, &pattern.regexp, err)
Expand Down Expand Up @@ -248,7 +248,7 @@ pub(in crate::compiler) fn expr_from_ast(

ast::Expr::Regexp(regexp) => {
re::parser::Parser::new()
.relaxed_escape_sequences(ctx.relaxed_re_escape_sequences)
.relaxed_re_syntax(ctx.relaxed_re_syntax)
.parse(regexp.as_ref())
.map_err(|err| { re_error_to_compile_error(ctx.report_builder, regexp, err)
})?;
Expand Down
41 changes: 20 additions & 21 deletions lib/src/compiler/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,10 @@ struct Namespace {
/// ```
///
pub struct Compiler<'a> {
/// Allow invalid escape sequences in regexps.
relaxed_re_escape_sequences: bool,
/// Mimics YARA behaviour with respect to regular expressions, allowing
/// some constructs that are invalid in YARA-X by default, like invalid
/// escape sequences.
relaxed_re_syntax: bool,

/// Used for generating error and warning reports.
report_builder: ReportBuilder,
Expand Down Expand Up @@ -307,7 +309,7 @@ impl<'a> Compiler<'a> {
wasm_mod,
wasm_symbols,
wasm_exports,
relaxed_re_escape_sequences: false,
relaxed_re_syntax: false,
next_pattern_id: PatternId(0),
current_pattern_id: PatternId(0),
current_namespace: default_namespace,
Expand Down Expand Up @@ -521,7 +523,7 @@ impl<'a> Compiler<'a> {

let mut rules = Rules {
serialized_globals,
relaxed_re_escape_sequences: self.relaxed_re_escape_sequences,
relaxed_re_syntax: self.relaxed_re_syntax,
wasm_mod: compiled_wasm_mod,
ac: None,
num_patterns: self.next_pattern_id.0 as usize,
Expand Down Expand Up @@ -562,32 +564,29 @@ impl<'a> Compiler<'a> {
self
}

/// Allow invalid escape sequences in regular expressions.
/// Enables a more relaxed syntax check for regular expressions.
///
/// Historically, YARA has accepted any character preceded by a backslash
/// in a regular expression, regardless of whether the sequence is valid.
/// For example, `\n`, `\t` and `\w` are valid escape sequences in a
/// regexp, but `\N`, `\T` and `\j` are not. However, YARA accepts all of
/// these sequences. Valid escape sequences are interpreted according to
/// their special meaning (`\n` as a new-line, `\w` as a word character,
/// etc.), while invalid escape sequences are interpreted simply as the
/// character that appears after the backslash. Thus, `\N` becomes `N`,
/// and `\j` becomes `j`.
/// YARA-X enforces stricter regular expression syntax compared to YARA.
/// For instance, YARA accepts invalid escape sequences and treats them
/// as literal characters (e.g., \R is interpreted as a literal 'R'). It
/// also allows some special characters to appear unescaped, inferring
/// their meaning from the context (e.g., `{` and `}` in `/foo{}bar/` are
/// literal, but in `/foo{0,1}bar/` they form the repetition operator
/// `{0,1}`).
///
/// This controls whether the compiler should accept invalid escape
/// sequences and translate them to plain characters. Invalid escape
/// sequences are not accepted by default.
/// This setting controls whether the compiler should mimic YARA's behavior,
/// allowing constructs that YARA-X doesn't accept by default.
///
/// This should be called before any rule is added to the compiler.
///
/// # Panics
///
/// If called after adding rules to the compiler.
pub fn relaxed_re_escape_sequences(&mut self, yes: bool) -> &mut Self {
pub fn relaxed_re_syntax(&mut self, yes: bool) -> &mut Self {
if !self.rules.is_empty() {
panic!("calling relaxed_re_escape_sequences in non-empty compiler")
panic!("calling relaxed_re_syntax in non-empty compiler")
}
self.relaxed_re_escape_sequences = yes;
self.relaxed_re_syntax = yes;
self
}

Expand Down Expand Up @@ -751,7 +750,7 @@ impl<'a> Compiler<'a> {
let mut rule_patterns = Vec::new();

let mut ctx = CompileContext {
relaxed_re_escape_sequences: self.relaxed_re_escape_sequences,
relaxed_re_syntax: self.relaxed_re_syntax,
current_symbol_table: None,
symbol_table: &mut self.symbol_table,
ident_pool: &mut self.ident_pool,
Expand Down
4 changes: 2 additions & 2 deletions lib/src/compiler/rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pub struct Rules {

/// If `true`, the regular expressions in `regexp_pool` are parsed with
/// the relaxed syntax check, which accepts constructs (like invalid
/// escape sequences) that are rejected by default.
pub(in crate::compiler) relaxed_re_escape_sequences: bool,
pub(in crate::compiler) relaxed_re_syntax: bool,

/// Pool with literal strings used in the rules. Each literal has its
/// own [`LiteralId`], which can be used for retrieving the literal
Expand Down Expand Up @@ -220,7 +220,7 @@ impl Rules {
let re = types::Regexp::new(self.regexp_pool.get(regexp_id).unwrap());

let parser = re::parser::Parser::new()
.relaxed_escape_sequences(self.relaxed_re_escape_sequences);
.relaxed_re_syntax(self.relaxed_re_syntax);

let hir = parser.parse(&re).unwrap().into_inner();

Expand Down
2 changes: 1 addition & 1 deletion lib/src/compiler/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ fn globals_json() {
fn invalid_escape_sequences() {
let mut compiler = Compiler::new();

compiler.relaxed_re_escape_sequences(true);
compiler.relaxed_re_syntax(true);
compiler
.add_source(r#"rule test { strings: $a = /\Release/ condition: $a }"#)
.unwrap();
Expand Down
Loading

0 comments on commit 14b4efe

Please sign in to comment.