Skip to content

Commit a579730

Browse files
authored
Optimize BaseParser#unnormalize method (#158)
## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.3/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.3 (2024-06-12 revision f1c7b6f435) [arm64-darwin22]
Calculating -------------------------------------
          before   after  before(YJIT)  after(YJIT)
   dom    17.704  18.106        34.215       33.806 i/s - 100.000 times in 5.648398s 5.523110s 2.922698s 2.958036s
   sax    25.664  25.302        48.429       48.602 i/s - 100.000 times in 3.896488s 3.952289s 2.064859s 2.057537s
  pull    28.966  29.215        61.710       62.068 i/s - 100.000 times in 3.452275s 3.422901s 1.620480s 1.611129s
stream    28.291  28.426        53.860       55.548 i/s - 100.000 times in 3.534716s 3.517884s 1.856667s 1.800247s

Comparison:
                 dom
  before(YJIT): 34.2 i/s
   after(YJIT): 33.8 i/s - 1.01x slower
         after: 18.1 i/s - 1.89x slower
        before: 17.7 i/s - 1.93x slower

                 sax
   after(YJIT): 48.6 i/s
  before(YJIT): 48.4 i/s - 1.00x slower
        before: 25.7 i/s - 1.89x slower
         after: 25.3 i/s - 1.92x slower

                pull
   after(YJIT): 62.1 i/s
  before(YJIT): 61.7 i/s - 1.01x slower
         after: 29.2 i/s - 2.12x slower
        before: 29.0 i/s - 2.14x slower

              stream
   after(YJIT): 55.5 i/s
  before(YJIT): 53.9 i/s - 1.03x slower
         after: 28.4 i/s - 1.95x slower
        before: 28.3 i/s - 1.96x slower
```

- YJIT=ON : 1.00x - 1.03x faster
- YJIT=OFF : 0.98x - 1.02x faster
1 parent e6e07f2 commit a579730

File tree

2 files changed

+31
-4
lines changed

2 files changed

+31
-4
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,13 @@ module Private
132132
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
133133
PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
134134
ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
135+
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
136+
CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
137+
DEFAULT_ENTITIES_PATTERNS = {}
138+
default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
139+
default_entities.each do |term|
140+
DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
141+
end
135142
end
136143
private_constant :Private
137144

@@ -504,10 +511,10 @@ def normalize( input, entities=nil, entity_filter=nil )
504511

505512
# Unescapes all possible entities
506513
def unnormalize( string, entities=nil, filter=nil )
507-
rv = string.gsub( /\r\n?/, "\n" )
514+
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
508515
matches = rv.scan( REFERENCE_RE )
509516
return rv if matches.size == 0
510-
rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
517+
rv.gsub!( Private::CHARACTER_REFERENCES ) {
511518
m=$1
512519
m = "0#{m}" if m[0] == ?x
513520
[Integer(m)].pack('U*')
@@ -518,15 +525,15 @@ def unnormalize( string, entities=nil, filter=nil )
518525
unless filter and filter.include?(entity_reference)
519526
entity_value = entity( entity_reference, entities )
520527
if entity_value
521-
re = /&#{entity_reference};/
528+
re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
522529
rv.gsub!( re, entity_value )
523530
else
524531
er = DEFAULT_ENTITIES[entity_reference]
525532
rv.gsub!( er[0], er[2] ) if er
526533
end
527534
end
528535
end
529-
rv.gsub!( /&amp;/, '&' )
536+
rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
530537
end
531538
rv
532539
end

test/test_pullparser.rb

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,26 @@ def test_entity_replacement
6262
end
6363
end
6464

65+
def test_character_references
66+
source = '<a>&#65;</a><b>&#x42;</b>'
67+
parser = REXML::Parsers::PullParser.new( source )
68+
element_name = ''
69+
while parser.has_next?
70+
event = parser.pull
71+
case event.event_type
72+
when :start_element
73+
element_name = event[0]
74+
when :text
75+
case element_name
76+
when 'a'
77+
assert_equal('A', event[1])
78+
when 'b'
79+
assert_equal('B', event[1])
80+
end
81+
end
82+
end
83+
end
84+
6585
def test_peek_unshift
6686
source = "<a><b/></a>"
6787
REXML::Parsers::PullParser.new(source)

0 commit comments

Comments
 (0)