Skip to content

Commit a11dbb1

Browse files
committed
Add IOSource#match_size() method
## Why?

`StringScanner#match?()` is faster than `StringScanner#check()`.

See: ruby/strscan#111

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.4/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.4 (2024-07-09 revision be1089c8ec) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     18.849      19.403        32.551       34.728 i/s -     100.000 times in 5.305314s 5.153743s 3.072111s 2.879488s
                 sax     27.706      29.435        48.126       52.247 i/s -     100.000 times in 3.609376s 3.397367s 2.077880s 1.913973s
                pull     31.817      33.907        56.941       58.925 i/s -     100.000 times in 3.142961s 2.949250s 1.756193s 1.697082s
              stream     31.120      33.186        52.530       55.816 i/s -     100.000 times in 3.213334s 3.013325s 1.903689s 1.791600s

Comparison:
                              dom
         after(YJIT):        34.7 i/s
        before(YJIT):        32.6 i/s - 1.07x  slower
               after:        19.4 i/s - 1.79x  slower
              before:        18.8 i/s - 1.84x  slower

                              sax
         after(YJIT):        52.2 i/s
        before(YJIT):        48.1 i/s - 1.09x  slower
               after:        29.4 i/s - 1.78x  slower
              before:        27.7 i/s - 1.89x  slower

                             pull
         after(YJIT):        58.9 i/s
        before(YJIT):        56.9 i/s - 1.03x  slower
               after:        33.9 i/s - 1.74x  slower
              before:        31.8 i/s - 1.85x  slower

                           stream
         after(YJIT):        55.8 i/s
        before(YJIT):        52.5 i/s - 1.06x  slower
               after:        33.2 i/s - 1.68x  slower
              before:        31.1 i/s - 1.79x  slower
```

- YJIT=ON : 1.03x - 1.09x faster
- YJIT=OFF : 1.03x - 1.06x faster
1 parent 6a8c041 commit a11dbb1

File tree

2 files changed

+67
-42
lines changed

2 files changed

+67
-42
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -269,10 +269,10 @@ def pull_event
269269
@source.ensure_buffer
270270
if @document_status == nil
271271
start_position = @source.position
272-
if @source.match("<?", true)
272+
if @source.match_size("<?", true)
273273
return process_instruction
274-
elsif @source.match("<!", true)
275-
if @source.match("--", true)
274+
elsif @source.match_size("<!", true)
275+
if @source.match_size("--", true)
276276
md = @source.match(/(.*?)-->/um, true)
277277
if md.nil?
278278
raise REXML::ParseException.new("Unclosed comment", @source)
@@ -281,10 +281,10 @@ def pull_event
281281
raise REXML::ParseException.new("Malformed comment", @source)
282282
end
283283
return [ :comment, md[1] ]
284-
elsif @source.match("DOCTYPE", true)
284+
elsif @source.match_size("DOCTYPE", true)
285285
base_error_message = "Malformed DOCTYPE"
286-
unless @source.match(/\s+/um, true)
287-
if @source.match(">")
286+
unless @source.match_size(/\s+/um, true)
287+
if @source.match_size(">")
288288
message = "#{base_error_message}: name is missing"
289289
else
290290
message = "#{base_error_message}: invalid name"
@@ -293,10 +293,10 @@ def pull_event
293293
raise REXML::ParseException.new(message, @source)
294294
end
295295
name = parse_name(base_error_message)
296-
if @source.match(/\s*\[/um, true)
296+
if @source.match_size(/\s*\[/um, true)
297297
id = [nil, nil, nil]
298298
@document_status = :in_doctype
299-
elsif @source.match(/\s*>/um, true)
299+
elsif @source.match_size(/\s*>/um, true)
300300
id = [nil, nil, nil]
301301
@document_status = :after_doctype
302302
@source.ensure_buffer
@@ -308,9 +308,9 @@ def pull_event
308308
# For backward compatibility
309309
id[1], id[2] = id[2], nil
310310
end
311-
if @source.match(/\s*\[/um, true)
311+
if @source.match_size(/\s*\[/um, true)
312312
@document_status = :in_doctype
313-
elsif @source.match(/\s*>/um, true)
313+
elsif @source.match_size(/\s*>/um, true)
314314
@document_status = :after_doctype
315315
@source.ensure_buffer
316316
else
@@ -320,7 +320,7 @@ def pull_event
320320
end
321321
args = [:start_doctype, name, *id]
322322
if @document_status == :after_doctype
323-
@source.match(/\s*/um, true)
323+
@source.match_size(/\s*/um, true)
324324
@stack << [ :end_doctype ]
325325
end
326326
return args
@@ -331,14 +331,14 @@ def pull_event
331331
end
332332
end
333333
if @document_status == :in_doctype
334-
@source.match(/\s*/um, true) # skip spaces
334+
@source.match_size(/\s*/um, true) # skip spaces
335335
start_position = @source.position
336-
if @source.match("<!", true)
337-
if @source.match("ELEMENT", true)
336+
if @source.match_size("<!", true)
337+
if @source.match_size("ELEMENT", true)
338338
md = @source.match(/(.*?)>/um, true)
339339
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
340340
return [ :elementdecl, "<!ELEMENT" + md[1] ]
341-
elsif @source.match("ENTITY", true)
341+
elsif @source.match_size("ENTITY", true)
342342
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
343343
unless match_data
344344
raise REXML::ParseException.new("Malformed entity declaration", @source)
@@ -370,7 +370,7 @@ def pull_event
370370
end
371371
match << '%' if ref
372372
return match
373-
elsif @source.match("ATTLIST", true)
373+
elsif @source.match_size("ATTLIST", true)
374374
md = @source.match(Private::ATTLISTDECL_END, true)
375375
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
376376
element = md[1]
@@ -390,10 +390,10 @@ def pull_event
390390
end
391391
end
392392
return [ :attlistdecl, element, pairs, contents ]
393-
elsif @source.match("NOTATION", true)
393+
elsif @source.match_size("NOTATION", true)
394394
base_error_message = "Malformed notation declaration"
395-
unless @source.match(/\s+/um, true)
396-
if @source.match(">")
395+
unless @source.match_size(/\s+/um, true)
396+
if @source.match_size(">")
397397
message = "#{base_error_message}: name is missing"
398398
else
399399
message = "#{base_error_message}: invalid name"
@@ -405,7 +405,7 @@ def pull_event
405405
id = parse_id(base_error_message,
406406
accept_external_id: true,
407407
accept_public_id: true)
408-
unless @source.match(/\s*>/um, true)
408+
unless @source.match_size(/\s*>/um, true)
409409
message = "#{base_error_message}: garbage before end >"
410410
raise REXML::ParseException.new(message, @source)
411411
end
@@ -419,7 +419,7 @@ def pull_event
419419
end
420420
elsif match = @source.match(/(%.*?;)\s*/um, true)
421421
return [ :externalentity, match[1] ]
422-
elsif @source.match(/\]\s*>/um, true)
422+
elsif @source.match_size(/\]\s*>/um, true)
423423
@document_status = :after_doctype
424424
return [ :end_doctype ]
425425
end
@@ -428,16 +428,16 @@ def pull_event
428428
end
429429
end
430430
if @document_status == :after_doctype
431-
@source.match(/\s*/um, true)
431+
@source.match_size(/\s*/um, true)
432432
end
433433
begin
434434
start_position = @source.position
435-
if @source.match("<", true)
435+
if @source.match_size("<", true)
436436
# :text's read_until may remain only "<" in buffer. In the
437437
# case, buffer is empty here. So we need to fill buffer
438438
# here explicitly.
439439
@source.ensure_buffer
440-
if @source.match("/", true)
440+
if @source.match_size("/", true)
441441
@namespaces_restore_stack.pop
442442
last_tag = @tags.pop
443443
md = @source.match(Private::CLOSE_PATTERN, true)
@@ -452,7 +452,7 @@ def pull_event
452452
raise REXML::ParseException.new(message, @source)
453453
end
454454
return [ :end_element, last_tag ]
455-
elsif @source.match("!", true)
455+
elsif @source.match_size("!", true)
456456
md = @source.match(/([^>]*>)/um)
457457
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
458458
raise REXML::ParseException.new("Malformed node", @source) unless md
@@ -470,7 +470,7 @@ def pull_event
470470
end
471471
raise REXML::ParseException.new( "Declarations can only occur "+
472472
"in the doctype declaration.", @source)
473-
elsif @source.match("?", true)
473+
elsif @source.match_size("?", true)
474474
return process_instruction
475475
else
476476
# Get the next tag
@@ -651,7 +651,7 @@ def need_source_encoding_update?(xml_declaration_encoding)
651651
def parse_name(base_error_message)
652652
md = @source.match(Private::NAME_PATTERN, true)
653653
unless md
654-
if @source.match(/\S/um)
654+
if @source.match_size(/\S/um)
655655
message = "#{base_error_message}: invalid name"
656656
else
657657
message = "#{base_error_message}: name is missing"
@@ -693,34 +693,34 @@ def parse_id_invalid_details(accept_external_id:,
693693
accept_public_id:)
694694
public = /\A\s*PUBLIC/um
695695
system = /\A\s*SYSTEM/um
696-
if (accept_external_id or accept_public_id) and @source.match(/#{public}/um)
697-
if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
696+
if (accept_external_id or accept_public_id) and @source.match_size(/#{public}/um)
697+
if @source.match_size(/#{public}(?:\s+[^'"]|\s*[\[>])/um)
698698
return "public ID literal is missing"
699699
end
700-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
700+
unless @source.match_size(/#{public}\s+#{PUBIDLITERAL}/um)
701701
return "invalid public ID literal"
702702
end
703703
if accept_public_id
704-
if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
704+
if @source.match_size(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um)
705705
return "system ID literal is missing"
706706
end
707-
unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
707+
unless @source.match_size(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
708708
return "invalid system literal"
709709
end
710710
"garbage after system literal"
711711
else
712712
"garbage after public ID literal"
713713
end
714-
elsif accept_external_id and @source.match(/#{system}/um)
715-
if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
714+
elsif accept_external_id and @source.match_size(/#{system}/um)
715+
if @source.match_size(/#{system}(?:\s+[^'"]|\s*[\[>])/um)
716716
return "system literal is missing"
717717
end
718-
unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
718+
unless @source.match_size(/#{system}\s+#{SYSTEMLITERAL}/um)
719719
return "invalid system literal"
720720
end
721721
"garbage after system literal"
722722
else
723-
unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
723+
unless @source.match_size(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
724724
return "invalid ID type"
725725
end
726726
"ID type is missing"
@@ -729,15 +729,15 @@ def parse_id_invalid_details(accept_external_id:,
729729

730730
def process_instruction
731731
name = parse_name("Malformed XML: Invalid processing instruction node")
732-
if @source.match(/\s+/um, true)
732+
if @source.match_size(/\s+/um, true)
733733
match_data = @source.match(/(.*?)\?>/um, true)
734734
unless match_data
735735
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
736736
end
737737
content = match_data[1]
738738
else
739739
content = nil
740-
unless @source.match("?>", true)
740+
unless @source.match_size("?>", true)
741741
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
742742
end
743743
end
@@ -767,17 +767,17 @@ def parse_attributes(prefixes)
767767
expanded_names = {}
768768
closed = false
769769
while true
770-
if @source.match(">", true)
770+
if @source.match_size(">", true)
771771
return attributes, closed
772-
elsif @source.match("/>", true)
772+
elsif @source.match_size("/>", true)
773773
closed = true
774774
return attributes, closed
775775
elsif match = @source.match(QNAME, true)
776776
name = match[1]
777777
prefix = match[2]
778778
local_part = match[3]
779779

780-
unless @source.match(/\s*=\s*/um, true)
780+
unless @source.match_size(/\s*=\s*/um, true)
781781
message = "Missing attribute equal: <#{name}>"
782782
raise REXML::ParseException.new(message, @source)
783783
end
@@ -793,7 +793,7 @@ def parse_attributes(prefixes)
793793
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
794794
raise REXML::ParseException.new(message, @source)
795795
end
796-
@source.match(/\s*/um, true)
796+
@source.match_size(/\s*/um, true)
797797
if prefix == "xmlns"
798798
if local_part == "xml"
799799
if value != Private::XML_PREFIXED_NAMESPACE

lib/rexml/source.rb

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,14 @@ def match(pattern, cons=false)
126126
end
127127
end
128128

129+
def match_size(pattern, cons=false)
130+
if cons
131+
@scanner.skip(pattern)
132+
else
133+
@scanner.match?(pattern)
134+
end
135+
end
136+
129137
def position
130138
@scanner.pos
131139
end
@@ -267,6 +275,23 @@ def match( pattern, cons=false )
267275
md.nil? ? nil : @scanner
268276
end
269277

278+
def match_size( pattern, cons=false )
279+
# To avoid performance issue, we need to increase bytes to read per scan
280+
min_bytes = 1
281+
while true
282+
if cons
283+
n_matched_bytes = @scanner.skip(pattern)
284+
else
285+
n_matched_bytes = @scanner.match?(pattern)
286+
end
287+
return n_matched_bytes if n_matched_bytes
288+
return nil if pattern.is_a?(String)
289+
return nil if @source.nil?
290+
return nil unless read(nil, min_bytes)
291+
min_bytes *= 2
292+
end
293+
end
294+
270295
def empty?
271296
super and ( @source.nil? || @source.eof? )
272297
end

0 commit comments

Comments
 (0)