From 4c26c2f090b82797ad569bd1974bdf0a903be39f Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 21 Jan 2020 14:22:32 +0100 Subject: [PATCH 1/2] Deduplicate Regexp literals Real world application contain many duplicated Regexp literals. From a rails/console in Redmine: ``` >> ObjectSpace.each_object(Regexp).count => 6828 >> ObjectSpace.each_object(Regexp).uniq.count => 4162 >> ObjectSpace.each_object(Regexp).to_a.map { |r| ObjectSpace.memsize_of(r) }.sum => 4611957 # 4.4 MB total >> ObjectSpace.each_object(Regexp).to_a.map { |r| ObjectSpace.memsize_of(r) }.sum - ObjectSpace.each_object(Regexp).to_a.uniq.map { |r| ObjectSpace.memsize_of(r) }.sum => 1490601 # 1.42 MB could be saved ``` Here's the to 10 duplicated regexps in Redmine: ``` 147: /"/ 107: /\s+/ 103: // 89: /\n/ 83: /'/ 76: /\s+/m 37: /\d+/ 35: /\[/ 33: /./ 33: /\\./ ``` --- gc.c | 8 +-- internal/re.h | 1 + internal/vm.h | 1 + re.c | 128 ++++++++++++++++++++++++++++----------- test/ruby/test_regexp.rb | 7 +++ vm.c | 13 ++++ vm_core.h | 1 + 7 files changed, 118 insertions(+), 41 deletions(-) diff --git a/gc.c b/gc.c index 59313d35d671ee..937f841a2da8b1 100644 --- a/gc.c +++ b/gc.c @@ -86,6 +86,7 @@ #include "internal/object.h" #include "internal/proc.h" #include "internal/rational.h" +#include "internal/re.h" #include "internal/sanitizers.h" #include "internal/struct.h" #include "internal/symbol.h" @@ -2710,11 +2711,8 @@ obj_free(rb_objspace_t *objspace, VALUE obj) } break; case T_REGEXP: - if (RANY(obj)->as.regexp.ptr) { - onig_free(RANY(obj)->as.regexp.ptr); - RB_DEBUG_COUNTER_INC(obj_regexp_ptr); - } - break; + rb_reg_free(obj); + break; case T_DATA: if (DATA_PTR(obj)) { int free_immediately = FALSE; diff --git a/internal/re.h b/internal/re.h index 9cf84393457e36..2169df3db1203c 100644 --- a/internal/re.h +++ b/internal/re.h @@ -14,6 +14,7 @@ /* re.c */ VALUE rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline); +void rb_reg_free(VALUE re); VALUE rb_reg_check_preprocess(VALUE); long rb_reg_search0(VALUE, VALUE, long, int, int); VALUE rb_reg_match_p(VALUE re, VALUE str, long pos); diff --git a/internal/vm.h b/internal/vm.h index c5986a1c2494d1..91319cbbd8a7a6 100644 --- a/internal/vm.h +++ b/internal/vm.h @@ -115,6 +115,7 @@ int rb_vm_check_optimizable_mid(VALUE mid); VALUE rb_yield_refine_block(VALUE refinement, VALUE refinements); MJIT_STATIC VALUE ruby_vm_special_exception_copy(VALUE); PUREFUNC(st_table *rb_vm_fstring_table(void)); +PUREFUNC(st_table *rb_vm_regexp_literals_table(void)); MJIT_SYMBOL_EXPORT_BEGIN VALUE vm_exec(struct rb_execution_context_struct *, int); /* used in JIT-ed code */ diff --git a/re.c b/re.c index 44418ec064246d..4fe442b67af424 100644 --- a/re.c +++ b/re.c @@ -13,12 +13,15 @@ #include +#include "debug_counter.h" #include "encindex.h" +#include "gc.h" #include "internal.h" #include "internal/error.h" #include "internal/hash.h" #include "internal/imemo.h" #include "internal/re.h" +#include "internal/vm.h" #include "regint.h" #include "ruby/encoding.h" #include "ruby/re.h" @@ -2956,36 +2959,42 @@ rb_reg_new(const char *s, long len, int options) return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options); } -VALUE -rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline) +static int +reg_lit_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing) { - VALUE re = rb_reg_alloc(); - onig_errmsg_buffer err = ""; + VALUE *new_re = (VALUE *)arg; + VALUE re = (VALUE)*key; - if (!str) str = rb_str_new(0,0); - if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) { - rb_set_errinfo(rb_reg_error_desc(str, options, err)); - return Qnil; + if (existing) { + /* because of lazy sweep, str may be unmarked already and swept + * at next time */ + + if (rb_objspace_garbage_object_p(re)) { + *new_re = Qundef; + return ST_DELETE; + } + + *new_re = re; + return ST_STOP; + } else { + FL_SET(re, REG_LITERAL); + rb_obj_freeze(re); + + *key = *value = *new_re = re; + return ST_CONTINUE; } - FL_SET(re, REG_LITERAL); - rb_obj_freeze(re); - return re; } -static VALUE reg_cache; -VALUE -rb_reg_regcomp(VALUE str) +static st_index_t +reg_hash(VALUE re) { - if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str) - && ENCODING_GET(reg_cache) == ENCODING_GET(str) - && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0) - return reg_cache; - - return reg_cache = rb_reg_new_str(str, 0); + st_index_t hashval; + hashval = RREGEXP_PTR(re)->options; + hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re))); + return rb_hash_end(hashval); } -static st_index_t reg_hash(VALUE re); /* * call-seq: * rxp.hash -> integer @@ -2998,21 +3007,75 @@ static st_index_t reg_hash(VALUE re); static VALUE rb_reg_hash(VALUE re) { + rb_reg_check(re); st_index_t hashval = reg_hash(re); return ST2FIX(hashval); } -static st_index_t -reg_hash(VALUE re) +VALUE +rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline) { - st_index_t hashval; + VALUE re, ret; + onig_errmsg_buffer err = ""; + st_table *regexp_literals = rb_vm_regexp_literals_table(); - rb_reg_check(re); - hashval = RREGEXP_PTR(re)->options; - hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re))); - return rb_hash_end(hashval); + if (!str) str = rb_str_new(0,0); + re = rb_reg_alloc(); + if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) { + rb_set_errinfo(rb_reg_error_desc(str, options, err)); + return Qnil; + } + + do { + ret = re; + st_update(regexp_literals, (st_data_t)re, + reg_lit_update_callback, (st_data_t)&ret); + } while (ret == Qundef); + return ret; +} + +void +rb_reg_free(VALUE re) { + if (FL_TEST(re, REG_LITERAL)) { + st_data_t regexp_literal = (st_data_t)re; + st_delete(rb_vm_regexp_literals_table(), ®exp_literal, NULL); + } + + onig_free(RREGEXP_PTR(re)); + RB_DEBUG_COUNTER_INC(obj_regexp_ptr); +} + +static VALUE reg_cache; + +VALUE +rb_reg_regcomp(VALUE str) +{ + if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str) + && ENCODING_GET(reg_cache) == ENCODING_GET(str) + && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0) + return reg_cache; + + return reg_cache = rb_reg_new_str(str, 0); } +static int +reg_cmp(VALUE re1, VALUE re2) +{ + if (re1 == re2) return 0; + + if (!RB_TYPE_P(re2, T_REGEXP)) return 1; + if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return 1; + if (RREGEXP_PTR(re1)->options != RREGEXP_PTR(re2)->options) return 1; + if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return 1; + if (ENCODING_GET(re1) != ENCODING_GET(re2)) return 1; + + return memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)); +} + +const struct st_hash_type rb_regexp_literal_hash_type = { + reg_cmp, + reg_hash, +}; /* * call-seq: @@ -3035,14 +3098,7 @@ rb_reg_equal(VALUE re1, VALUE re2) if (re1 == re2) return Qtrue; if (!RB_TYPE_P(re2, T_REGEXP)) return Qfalse; rb_reg_check(re1); rb_reg_check(re2); - if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse; - if (RREGEXP_PTR(re1)->options != RREGEXP_PTR(re2)->options) return Qfalse; - if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse; - if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse; - if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) { - return Qtrue; - } - return Qfalse; + return reg_cmp(re1, re2) == 0 ? Qtrue : Qfalse; } /* diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index 231fd392d17d0e..3411746cc9ebec 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -62,6 +62,13 @@ def test_assert_normal_exit Regexp.union("a", "a") end + def test_literal_deduplication + assert_same(/a/, /a/) + refute_same(/a/, /a/m) + refute_same(/a/, Regexp.new('a')) + assert_equal(/a/, Regexp.new('a')) + end + def test_to_s assert_equal '(?-mix:\x00)', Regexp.new("\0").to_s diff --git a/vm.c b/vm.c index e772d3b1124413..e2b568c53be029 100644 --- a/vm.c +++ b/vm.c @@ -2241,6 +2241,7 @@ rb_vm_update_references(void *ptr) if (ptr) { rb_vm_t *vm = ptr; rb_gc_update_tbl_refs(vm->frozen_strings); + rb_gc_update_tbl_refs(vm->regexp_literals); } } @@ -2354,6 +2355,10 @@ ruby_vm_destruct(rb_vm_t *vm) st_free_table(vm->frozen_strings); vm->frozen_strings = 0; } + if (vm->regexp_literals) { + st_free_table(vm->regexp_literals); + vm->regexp_literals = 0; + } rb_vm_gvl_destroy(vm); RB_ALTSTACK_FREE(vm->main_altstack); if (objspace) { @@ -3310,6 +3315,7 @@ rb_vm_set_progname(VALUE filename) } extern const struct st_hash_type rb_fstring_hash_type; +extern const struct st_hash_type rb_regexp_literal_hash_type; void Init_BareVM(void) @@ -3345,6 +3351,7 @@ Init_vm_objects(void) vm->mark_object_ary = rb_ary_tmp_new(128); vm->loading_table = st_init_strtable(); vm->frozen_strings = st_init_table_with_size(&rb_fstring_hash_type, 1000); + vm->regexp_literals = st_init_table_with_size(&rb_regexp_literal_hash_type, 1000); } /* top self */ @@ -3406,6 +3413,12 @@ rb_vm_fstring_table(void) return GET_VM()->frozen_strings; } +st_table * +rb_vm_regexp_literals_table(void) +{ + return GET_VM()->regexp_literals; +} + #if VM_COLLECT_USAGE_DETAILS #define HASH_ASET(h, k, v) rb_hash_aset((h), (st_data_t)(k), (st_data_t)(v)) diff --git a/vm_core.h b/vm_core.h index a2f6af3c3095ba..53c66ec6ba0612 100644 --- a/vm_core.h +++ b/vm_core.h @@ -662,6 +662,7 @@ typedef struct rb_vm_struct { VALUE *defined_strings; st_table *frozen_strings; + st_table *regexp_literals; const struct rb_builtin_function *builtin_function_table; int builtin_inline_index; From 25f19780a8a8b1c6d9b83bb71849acdf6a76e423 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 22 Jan 2020 21:04:14 +0100 Subject: [PATCH 2/2] Mark literal regexps src strings --- re.c | 3 ++- vm.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/re.c b/re.c index 4fe442b67af424..f59cc4d7040bc4 100644 --- a/re.c +++ b/re.c @@ -2980,7 +2980,8 @@ reg_lit_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int exi FL_SET(re, REG_LITERAL); rb_obj_freeze(re); - *key = *value = *new_re = re; + *key = *new_re = re; + *value = RREGEXP_SRC(re); return ST_CONTINUE; } } diff --git a/vm.c b/vm.c index e2b568c53be029..756767c793c486 100644 --- a/vm.c +++ b/vm.c @@ -2296,7 +2296,7 @@ rb_vm_mark(void *ptr) rb_hook_list_mark(&vm->global_hooks); rb_gc_mark_values(RUBY_NSIG, vm->trap_list.cmd); - + rb_mark_tbl_no_pin(vm->regexp_literals); mjit_mark(); }