@@ -132,6 +132,13 @@ module Private
132
132
GEDECL_PATTERN = "\\ s+#{ NAME } \\ s+#{ ENTITYDEF } \\ s*>"
133
133
PEDECL_PATTERN = "\\ s+(%)\\ s+#{ NAME } \\ s+#{ PEDEF } \\ s*>"
134
134
ENTITYDECL_PATTERN = /(?:#{ GEDECL_PATTERN } )|(?:#{ PEDECL_PATTERN } )/um
135
# Matches a carriage return optionally followed by a line feed, so that
# unnormalize can replace both "\r\n" and a lone "\r" with "\n".
CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/

# Matches a numeric character reference such as "&#65;", "&#0065;" or
# "&#x41;". Leading zeros are skipped; capture group 1 holds either the
# decimal digits or "x" followed by the hex digits, which is the form
# unnormalize feeds to Integer() after prefixing a "0" to hex values.
CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/

# Precompiled "&name;" patterns for the five predefined entities, keyed by
# entity name (a String), so unnormalize does not rebuild a Regexp for them
# on every call. Frozen because it is a lookup table built once at load time.
DEFAULT_ENTITIES_PATTERNS =
  %w[gt lt quot apos amp].each_with_object({}) do |term, patterns|
    patterns[term] = /&#{term};/
  end.freeze
135
142
end
136
143
private_constant :Private
137
144
@@ -504,10 +511,10 @@ def normalize( input, entities=nil, entity_filter=nil )
504
511
505
512
# Unescapes all possible entities
506
513
def unnormalize ( string , entities = nil , filter = nil )
507
- rv = string . gsub ( / \r \n ?/ , "\n " )
514
+ rv = string . gsub ( Private :: CARRIAGE_RETURN_NEWLINE_PATTERN , "\n " )
508
515
matches = rv . scan ( REFERENCE_RE )
509
516
return rv if matches . size == 0
510
- rv . gsub! ( /�*((?: \d +)|(?:x[a-fA-F0-9]+));/ ) {
517
+ rv . gsub! ( Private :: CHARACTER_REFERENCES ) {
511
518
m = $1
512
519
m = "0#{ m } " if m [ 0 ] == ?x
513
520
[ Integer ( m ) ] . pack ( 'U*' )
@@ -518,15 +525,15 @@ def unnormalize( string, entities=nil, filter=nil )
518
525
unless filter and filter . include? ( entity_reference )
519
526
entity_value = entity ( entity_reference , entities )
520
527
if entity_value
521
- re = /&#{ entity_reference } ;/
528
+ re = Private :: DEFAULT_ENTITIES_PATTERNS [ entity_reference ] || /&#{ entity_reference } ;/
522
529
rv . gsub! ( re , entity_value )
523
530
else
524
531
er = DEFAULT_ENTITIES [ entity_reference ]
525
532
rv . gsub! ( er [ 0 ] , er [ 2 ] ) if er
526
533
end
527
534
end
528
535
end
529
- rv . gsub! ( /& amp;/ , '&' )
536
+ rv . gsub! ( Private :: DEFAULT_ENTITIES_PATTERNS [ ' amp' ] , '&' )
530
537
end
531
538
rv
532
539
end
0 commit comments