Skip to content

Commit

Permalink
Accept String as a pattern at non head (#106)
Browse files Browse the repository at this point in the history
It supports non-head match cases such as StringScanner#scan_until.

If we use a String as a pattern, we can improve match performance.
Here is the result of the included benchmark.

## CRuby

It shows that a String pattern held in a variable (`string_var`) is 1.18x faster than a Regexp literal pattern (`regexp`).

```
$ benchmark-driver benchmark/check_until.yaml
Warming up --------------------------------------
              regexp     9.403M i/s -      9.548M times in 1.015459s (106.35ns/i)
          regexp_var     9.162M i/s -      9.248M times in 1.009479s (109.15ns/i)
              string     8.966M i/s -      9.274M times in 1.034343s (111.54ns/i)
          string_var    11.051M i/s -     11.190M times in 1.012538s (90.49ns/i)
Calculating -------------------------------------
              regexp    10.319M i/s -     28.209M times in 2.733707s (96.91ns/i)
          regexp_var    10.032M i/s -     27.485M times in 2.739807s (99.68ns/i)
              string     9.681M i/s -     26.897M times in 2.778397s (103.30ns/i)
          string_var    12.162M i/s -     33.154M times in 2.726046s (82.22ns/i)

Comparison:
          string_var:  12161920.6 i/s 
              regexp:  10318949.7 i/s - 1.18x  slower
          regexp_var:  10031617.6 i/s - 1.21x  slower
              string:   9680843.7 i/s - 1.26x  slower
```

## JRuby

It shows that a String literal pattern (`string`) is 2.11x faster than a Regexp pattern held in a variable (`regexp_var`).

```
$ benchmark-driver benchmark/check_until.yaml
Warming up --------------------------------------
              regexp     7.591M i/s -      7.544M times in 0.993780s (131.74ns/i)
          regexp_var     6.143M i/s -      6.125M times in 0.997038s (162.77ns/i)
              string    14.135M i/s -     14.079M times in 0.996067s (70.75ns/i)
          string_var    14.079M i/s -     14.057M times in 0.998420s (71.03ns/i)
Calculating -------------------------------------
              regexp     9.409M i/s -     22.773M times in 2.420268s (106.28ns/i)
          regexp_var    10.116M i/s -     18.430M times in 1.821820s (98.85ns/i)
              string    21.389M i/s -     42.404M times in 1.982519s (46.75ns/i)
          string_var    20.897M i/s -     42.237M times in 2.021187s (47.85ns/i)

Comparison:
              string:  21389191.1 i/s 
          string_var:  20897327.5 i/s - 1.02x  slower
          regexp_var:  10116464.7 i/s - 2.11x  slower
              regexp:   9409222.3 i/s - 2.27x  slower
```

See:
https://github.com/jruby/jruby/blob/be7815ec02356a58891c8727bb448f0c6a826d96/core/src/main/java/org/jruby/util/StringSupport.java#L1706-L1736

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>
  • Loading branch information
naitoh and kou authored Sep 14, 2024
1 parent badf6db commit f9d96c4
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 33 deletions.
15 changes: 15 additions & 0 deletions benchmark/check_until.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# benchmark-driver definition: compares StringScanner#check_until when the
# pattern is a Regexp or a String, each given as a literal and via a variable.
prelude: |-
$LOAD_PATH.unshift(File.expand_path("lib"))
require "strscan"
scanner = StringScanner.new("test string")
str = "string"
reg = /string/
benchmark:
regexp: |
scanner.check_until(/string/)
regexp_var: |
scanner.check_until(reg)
string: |
scanner.check_until("string")
string_var: |
scanner.check_until(str)
30 changes: 13 additions & 17 deletions ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -262,17 +262,6 @@ private IRubyObject extractBegLen(Ruby runtime, int beg, int len) {
// MRI: strscan_do_scan
private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succptr, boolean getstr, boolean headonly) {
final Ruby runtime = context.runtime;

if (headonly) {
if (!(regex instanceof RubyRegexp)) {
regex = regex.convertToString();
}
} else {
if (!(regex instanceof RubyRegexp)) {
throw runtime.newTypeError("wrong argument type " + regex.getMetaClass() + " (expected Regexp)");
}
}

check(context);

ByteList strBL = str.getByteList();
Expand Down Expand Up @@ -310,9 +299,9 @@ private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succp
}
if (ret < 0) return context.nil;
} else {
RubyString pattern = (RubyString) regex;
RubyString pattern = regex.convertToString();

str.checkEncoding(pattern);
Encoding patternEnc = str.checkEncoding(pattern);

if (restLen() < pattern.size()) {
return context.nil;
Expand All @@ -321,11 +310,18 @@ private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succp
ByteList patternBL = pattern.getByteList();
int patternSize = patternBL.realSize();

if (ByteList.memcmp(strBL.unsafeBytes(), strBeg + curr, patternBL.unsafeBytes(), patternBL.begin(), patternSize) != 0) {
return context.nil;
if (headonly) {
if (ByteList.memcmp(strBL.unsafeBytes(), strBeg + curr, patternBL.unsafeBytes(), patternBL.begin(), patternSize) != 0) {
return context.nil;
}
setRegisters(patternSize);
} else {
int pos = StringSupport.index(strBL, patternBL, strBeg + curr, patternEnc);
if (pos == -1) {
return context.nil;
}
setRegisters(patternSize + pos - curr);
}

setRegisters(patternSize);
}

setMatched();
Expand Down
25 changes: 14 additions & 11 deletions ext/strscan/strscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -686,14 +686,6 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly
{
struct strscanner *p;

if (headonly) {
if (!RB_TYPE_P(pattern, T_REGEXP)) {
StringValue(pattern);
}
}
else {
Check_Type(pattern, T_REGEXP);
}
GET_SCANNER(self, p);

CLEAR_MATCH_STATUS(p);
Expand All @@ -714,14 +706,25 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly
}
}
else {
StringValue(pattern);
rb_enc_check(p->str, pattern);
if (S_RESTLEN(p) < RSTRING_LEN(pattern)) {
return Qnil;
}
if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) {
return Qnil;

if (headonly) {
if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) {
return Qnil;
}
set_registers(p, RSTRING_LEN(pattern));
} else {
long pos = rb_memsearch(RSTRING_PTR(pattern), RSTRING_LEN(pattern),
CURPTR(p), S_RESTLEN(p), rb_enc_get(pattern));
if (pos == -1) {
return Qnil;
}
set_registers(p, RSTRING_LEN(pattern) + pos);
}
set_registers(p, RSTRING_LEN(pattern));
}

MATCHED(p);
Expand Down
80 changes: 75 additions & 5 deletions test/strscan/test_stringscanner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -262,15 +262,15 @@ def test_concat
end

def test_scan
s = create_string_scanner('stra strb strc', true)
s = create_string_scanner("stra strb\0strc", true)
tmp = s.scan(/\w+/)
assert_equal 'stra', tmp

tmp = s.scan(/\s+/)
assert_equal ' ', tmp

assert_equal 'strb', s.scan(/\w+/)
assert_equal ' ', s.scan(/\s+/)
assert_equal "\u0000", s.scan(/\0/)

tmp = s.scan(/\w+/)
assert_equal 'strc', tmp
Expand Down Expand Up @@ -312,11 +312,14 @@ def test_scan
end

def test_scan_string
s = create_string_scanner('stra strb strc')
s = create_string_scanner("stra strb\0strc")
assert_equal 'str', s.scan('str')
assert_equal 'str', s[0]
assert_equal 3, s.pos
assert_equal 'a ', s.scan('a ')
assert_equal 'strb', s.scan('strb')
assert_equal "\u0000", s.scan("\0")
assert_equal 'strc', s.scan('strc')

str = 'stra strb strc'.dup
s = create_string_scanner(str, false)
Expand Down Expand Up @@ -668,13 +671,47 @@ def test_exist_p
assert_equal(nil, s.exist?(/e/))
end

def test_exist_p_string
def test_exist_p_invalid_argument
s = create_string_scanner("test string")
assert_raise(TypeError) do
s.exist?(" ")
s.exist?(1)
end
end

# Exercises StringScanner#exist? with String (non-Regexp) patterns.
def test_exist_p_string
  omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
  scanner = create_string_scanner("test string")

  # Looking ahead from the start must report an offset and leave pos untouched.
  offset_from_start = scanner.exist?("s")
  assert_equal(3, offset_from_start)
  assert_equal(0, scanner.pos)

  # After consuming "test", the expected offset is counted from the new pos.
  scanner.scan("test")
  offset_after_scan = scanner.exist?("s")
  assert_equal(2, offset_after_scan)
  assert_equal(4, scanner.pos)

  # No "e" is left ahead of pos, so the lookup is expected to yield nil.
  missing = scanner.exist?("e")
  assert_equal(nil, missing)
end

# Checks StringScanner#scan_until with Regexp patterns against a subject that
# contains a NUL byte: each hit returns the text from the old position through
# the match and advances pos; a miss returns nil.
def test_scan_until
  s = create_string_scanner("Foo Bar\0Baz")
  assert_equal("Foo", s.scan_until(/Foo/))
  assert_equal(3, s.pos)
  assert_equal(" Bar", s.scan_until(/Bar/))
  assert_equal(7, s.pos)
  # The miss case must go through scan_until itself (not skip_until) so that
  # this test covers the method it is named after.
  assert_equal(nil, s.scan_until(/Qux/))
  # The expected text starts at the NUL byte, i.e. the failed search above
  # must not have moved the scan position.
  assert_equal("\u0000Baz", s.scan_until(/Baz/))
  assert_equal(11, s.pos)
end

# Checks StringScanner#scan_until with String patterns against a subject that
# contains a NUL byte: each hit returns the text from the old position through
# the match and advances pos; a miss returns nil.
def test_scan_until_string
  omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
  s = create_string_scanner("Foo Bar\0Baz")
  assert_equal("Foo", s.scan_until("Foo"))
  assert_equal(3, s.pos)
  assert_equal(" Bar", s.scan_until("Bar"))
  assert_equal(7, s.pos)
  # The miss case must go through scan_until itself (not skip_until) so that
  # the String-pattern path of the method under test is covered.
  assert_equal(nil, s.scan_until("Qux"))
  # The expected text starts at the NUL byte, i.e. the failed search above
  # must not have moved the scan position.
  assert_equal("\u0000Baz", s.scan_until("Baz"))
  assert_equal(11, s.pos)
end

def test_skip_until
s = create_string_scanner("Foo Bar Baz")
assert_equal(3, s.skip_until(/Foo/))
Expand All @@ -684,6 +721,16 @@ def test_skip_until
assert_equal(nil, s.skip_until(/Qux/))
end

# Exercises StringScanner#skip_until with String (non-Regexp) patterns.
def test_skip_until_string
  omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
  scanner = create_string_scanner("Foo Bar Baz")

  # Each row: pattern, expected return value, expected pos afterwards.
  [
    ["Foo", 3, 3],
    ["Bar", 4, 7],
  ].each do |pattern, skipped, position|
    assert_equal(skipped, scanner.skip_until(pattern))
    assert_equal(position, scanner.pos)
  end

  # A pattern that never occurs is expected to yield nil.
  assert_equal(nil, scanner.skip_until("Qux"))
end

def test_check_until
s = create_string_scanner("Foo Bar Baz")
assert_equal("Foo", s.check_until(/Foo/))
Expand All @@ -693,6 +740,16 @@ def test_check_until
assert_equal(nil, s.check_until(/Qux/))
end

# Exercises StringScanner#check_until with String (non-Regexp) patterns.
def test_check_until_string
  omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
  scanner = create_string_scanner("Foo Bar Baz")

  # pattern => expected return value; pos is expected to stay at 0 after each
  # call, so every lookup starts from the beginning of the subject.
  { "Foo" => "Foo", "Bar" => "Foo Bar" }.each do |pattern, expected|
    assert_equal(expected, scanner.check_until(pattern))
    assert_equal(0, scanner.pos)
  end

  # A pattern that never occurs is expected to yield nil.
  assert_equal(nil, scanner.check_until("Qux"))
end

def test_search_full
s = create_string_scanner("Foo Bar Baz")
assert_equal(8, s.search_full(/Bar /, false, false))
Expand All @@ -705,6 +762,19 @@ def test_search_full
assert_equal(11, s.pos)
end

# Exercises StringScanner#search_full with String (non-Regexp) patterns.
def test_search_full_string
  omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
  scanner = create_string_scanner("Foo Bar Baz")

  # Each row: pattern, second argument, third argument, expected return value,
  # expected pos afterwards. Per the expectations below, pos only moves when
  # the second argument is true, and a String (rather than a length) comes
  # back when the third argument is true.
  [
    ["Bar ", false, false, 8, 0],
    ["Bar ", false, true, "Foo Bar ", 0],
    ["Bar ", true, false, 8, 8],
    ["az", true, true, "Baz", 11],
  ].each do |pattern, advance, return_string, expected, expected_pos|
    assert_equal(expected, scanner.search_full(pattern, advance, return_string))
    assert_equal(expected_pos, scanner.pos)
  end
end

def test_peek
s = create_string_scanner("test string")
assert_equal("test st", s.peek(7))
Expand Down

0 comments on commit f9d96c4

Please sign in to comment.