@@ -5,17 +5,45 @@ import org.jsoup.nodes.Document
55import org.jsoup.nodes.Entities
66
77object HtmlSimplifier {
8- val log = org.slf4j.LoggerFactory .getLogger(HtmlSimplifier ::class .java)
8+ private val log = org.slf4j.LoggerFactory .getLogger(HtmlSimplifier ::class .java)
9+
10+ /* * Elements that can execute scripts or load external content */
11+ private val SCRIPT_ELEMENTS = setOf (
12+ " script" , " noscript" , " iframe"
13+ )
14+ /* * Elements that handle user input */
15+ private val INTERACTIVE_ELEMENTS = setOf (
16+ " form" , " input" , " textarea" , " button" , " select" , " option"
17+ )
18+ /* * Elements that load or display media content */
19+ private val MEDIA_ELEMENTS = setOf (
20+ " canvas" , " audio" , " video" , " source" , " track" , " picture"
21+ )
922 private val PRESERVED_ELEMENTS = setOf (
10- " p" , " div" , " span" , " table" , " tr" , " td" , " th" ,
11- " thead" , " tbody" , " tfoot" , " ul" , " ol" , " li" , " h1" , " h2" , " h3" , " h4" , " h5" , " h6" ,
12- " br" , " hr" , " img"
23+ " p" , " div" , " span" , " table" , " tr" , " td" , " th" , " thead" , " tbody" , " tfoot" , " ul" , " ol" , " li" , " h1" , " h2" , " h3" , " h4" , " h5" , " h6" , " br" , " hr" , " img"
1324 )
1425 private val DEFAULT_IMPORTANT_ATTRIBUTES = setOf (
15- " href" , " src" , " alt" , " title" , " style" ,
16- " class" , " name" , " rel" , " type" , " content" ,
17- " colspan" , " rowspan" , " scope" , " id" , " lang" ,
18- " aria-label" , " aria-describedby" , " role"
26+ " href" ,
27+ " src" ,
28+ " alt" ,
29+ " title" ,
30+ " style" ,
31+ " class" ,
32+ " name" ,
33+ " rel" ,
34+ " type" ,
35+ " content" ,
36+ " colspan" ,
37+ " rowspan" ,
38+ " scope" ,
39+ " id" ,
40+ " lang" ,
41+ " aria-label" ,
42+ " aria-describedby" ,
43+ " role"
44+ )
45+ private val SCRIPT_ATTRIBUTES = setOf (
46+ " onclick" , " onload" , " onsubmit" , " oninput" , " onchange"
1947 )
2048
2149 fun scrubHtml (
@@ -24,16 +52,27 @@ object HtmlSimplifier {
2452 includeCssData : Boolean = false,
2553 simplifyStructure : Boolean = true,
2654 keepObjectIds : Boolean = false,
27- preserveWhitespace : Boolean = false
55+ preserveWhitespace : Boolean = false,
56+ keepScriptElements : Boolean = false,
57+ keepInteractiveElements : Boolean = false,
58+ keepMediaElements : Boolean = false,
59+ keepEventHandlers : Boolean = false
2860 ): String {
61+ // Add input validation for baseUrl
62+ baseUrl?.let {
63+ require(! it.startsWith(" javascript:" ) && ! it.startsWith(" data:" )) { " Invalid base URL scheme" }
64+ }
2965 require(str.isNotBlank()) { " Input HTML cannot be blank" }
66+ require(! str.startsWith(" data:" )) { " Data URLs are not supported" }
67+ require(! str.startsWith(" javascript:" )) { " JavaScript URLs are not supported" }
68+
3069 val document: Document = try {
3170 if (null != baseUrl) Jsoup .parse(str, baseUrl) else Jsoup .parse(str)
3271 } catch (e: Exception ) {
3372 throw IllegalArgumentException (" Failed to parse HTML: ${e.message} " , e)
3473 }
3574
36- fun simplifyDocument (stepName : String = "", fn : Document .() -> Unit ) = try {
75+ fun simplifyDocument (stepName : String = "", fn : Document .() -> Unit ) = try {
3776 val prevDocSize = document.html().length
3877 val startTime = System .currentTimeMillis()
3978 document.fn()
@@ -44,76 +83,82 @@ object HtmlSimplifier {
4483 log.warn(" Failed to simplify HTML in ${stepName} : ${e.message} " , e)
4584 }
4685
47- simplifyDocument(stepName= " Setup" ) {
86+ simplifyDocument(stepName = " Setup" ) {
4887 outputSettings().prettyPrint(true )
4988 outputSettings().charset(" UTF-8" )
5089 outputSettings().escapeMode(Entities .EscapeMode .xhtml)
5190 outputSettings().syntax(Document .OutputSettings .Syntax .html)
5291 }
5392
54- simplifyDocument(stepName= " RemoveUnsafeElements" ) {
93+ simplifyDocument(stepName = " RemoveUnsafeElements" ) {
94+ val elementsToRemove = mutableListOf<String >()
95+ elementsToRemove.addAll(
96+ listOf (
97+ " link" , " meta" , " object" , " embed" , " applet" , " base" , " frame" , " frameset" , " marquee" , " blink"
98+ )
99+ )
100+ if (! keepScriptElements) elementsToRemove.addAll(SCRIPT_ELEMENTS )
101+ if (! keepInteractiveElements) elementsToRemove.addAll(INTERACTIVE_ELEMENTS )
102+ if (! keepMediaElements) elementsToRemove.addAll(MEDIA_ELEMENTS )
103+ if (! includeCssData) elementsToRemove.add(" style" )
55104 select(
56- """
57- script, link, meta, iframe, noscript,
58- object, embed, form, input, textarea,
59- button, svg,${if (includeCssData) " style," else " " }
60- canvas, audio, video, source, applet, base,
61- frame, frameset, marquee, blink
62- """ .trimIndent().replace(" \n " , " " )
105+ elementsToRemove.joinToString(" , " )
63106 ).remove()
64107 }
65108
// Step: strip HTML5 `data-*` attributes from every element that carries at least one.
// jsoup has no `[data-*]` wildcard: that form is read as a literal attribute name and
// therefore selects nothing. The attribute-name-prefix selector is `[^prefix]`.
simplifyDocument(stepName = "RemoveDataAttributes") {
    select("[^data-]").forEach { element ->
        element.attributes().removeAll { attr -> attr.key.startsWith("data-") }
    }
}
69112
// Step: drop inline event-handler attributes (onclick, onload, onerror, ...) unless the
// caller opted in through keepEventHandlers.
simplifyDocument(stepName = "RemoveEventHandlers") {
    if (!keepEventHandlers) {
        select("*").forEach { element ->
            // Every attribute whose name starts with "on" is treated as a handler. The key is
            // lowercased first so mixed-case names are caught as well. No allow-list is
            // consulted here: exempting the well-known handlers (onclick, onload, ...) would
            // leave exactly the attributes this step exists to remove.
            element.attributes().removeAll { attr -> attr.key.lowercase().startsWith("on") }
        }
    }
}
75123
// Step: remove any attribute whose value mentions a scheme that can execute code or reach
// local resources.
simplifyDocument(stepName = "RemoveUnsafeAttributes") {
    // The scrub is skipped as a whole when keepScriptElements is set; that gate is kept
    // unchanged and merely evaluated once instead of once per attribute.
    if (!keepScriptElements) {
        val unsafeSchemes = listOf("javascript:", "data:", "vbscript:", "file:")
        select("*").forEach { element ->
            // removeAll deletes through the iterator. Calling element.removeAttr() from inside
            // attributes().forEach would mutate the collection while it is being iterated.
            element.attributes().removeAll { attr ->
                // URL schemes are case-insensitive, so "JavaScript:" must match too.
                unsafeSchemes.any { scheme -> attr.value.contains(scheme, ignoreCase = true) }
            }
        }
    }
}
89136
// Step: keep only whitelisted attributes on every element.
// The whitelist is DEFAULT_IMPORTANT_ATTRIBUTES minus a set chosen by the flags:
//   includeCssData -> nothing is excluded
//   keepObjectIds  -> styling/layout attributes are excluded, "id" stays
//   otherwise      -> styling/layout attributes and "id" are excluded
simplifyDocument(stepName = "FilterAttributes") {
    val excludedAttributes: Set<String> = when {
        includeCssData -> emptySet()
        keepObjectIds -> setOf("style", "class", "width", "height", "target")
        else -> setOf("style", "class", "id", "width", "height", "target")
    }
    val importantAttributes = DEFAULT_IMPORTANT_ATTRIBUTES - excludedAttributes
    select("*").forEach { element ->
        element.attributes().removeAll { attr ->
            // Event handlers are never on the whitelist, so without this exemption
            // keepEventHandlers = true would have no visible effect on the output.
            val isKeptHandler = keepEventHandlers && attr.key.lowercase().startsWith("on")
            attr.key !in importantAttributes && !isKeptHandler
        }
    }
}
104149
105- simplifyDocument(stepName= " RemoveEmptyElements" ) {
150+ simplifyDocument(stepName = " RemoveEmptyElements" ) {
106151 select(" *:not(img)" ).forEach { element ->
107- if (element.text().isBlank() &&
108- element.attributes().isEmpty &&
109- ! element.select(" img, br, hr, iframe[src]" ).any() // Preserve non-empty iframes
152+ if (element.text().isBlank() &&
153+ element.attributes().isEmpty &&
154+ ! element.select(" img, br, hr, iframe[src], svg, source[src], track[src] " ).any() // Added more media elements
110155 ) {
111156 element.remove()
112157 }
113158 }
114159 }
115160
116- simplifyDocument(stepName= " CleanupHrefAttributes" ) {
161+ simplifyDocument(stepName = " CleanupHrefAttributes" ) {
117162 select(" a[href]" ).forEach { element ->
118163 val href = element.attr(" href" )
119164 if (href.startsWith(" javascript:" ) || href.startsWith(" data:" )) {
@@ -122,37 +167,43 @@ object HtmlSimplifier {
122167 }
123168 }
124169
125- simplifyDocument(stepName= " UnwrapSimpleTextElements" ) {
170+ simplifyDocument(stepName = " UnwrapSimpleTextElements" ) {
126171 select(" *" ).forEach { element ->
127- if (element.tagName() !in PRESERVED_ELEMENTS &&
128- element.childNodes().size == 1 &&
129- element.childNodes().first()?.nodeName() == " #text" &&
130- element.attributes().isEmpty()
172+ if (element.tagName() !in PRESERVED_ELEMENTS && element.childNodes().size == 1
173+ && element.childNodes().first()?.nodeName() == " #text" && element.attributes().isEmpty()
131174 ) {
132175 element.unwrap()
133176 }
134177 }
135178 }
136179
// Step: rewrite relative link and media URLs as absolute ones. Only runs when a base URL
// was supplied.
simplifyDocument(stepName = "ConvertRelativeUrls") {
    if (baseUrl != null) {
        // selector -> name of the URL-carrying attribute on the matched elements
        val urlAttributes = listOf(
            "a[href]" to "href",
            "img[src], source[src], track[src]" to "src",
        )
        for ((selector, attribute) in urlAttributes) {
            select(selector).forEach { element ->
                val absolute = element.absUrl(attribute)
                // absUrl yields "" when the value cannot be resolved against the base URL.
                // Writing that back would erase the original URL, so it is left untouched.
                if (absolute.isNotEmpty()) {
                    element.attr(attribute, absolute)
                }
            }
        }
    }
}
145197
// Step: final attribute sweep. Drops attributes that are blank, hold the literal text
// "null", or still mention a javascript:/data: scheme after the earlier steps.
simplifyDocument(stepName = "RemoveInvalidAttributes") {
    select("*").forEach { element ->
        element.attributes().removeAll { attr ->
            val value = attr.value
            value.isBlank() ||
                value == "null" ||
                // Schemes are matched case-insensitively so "JavaScript:" is caught as well.
                value.contains("javascript:", ignoreCase = true) ||
                value.contains("data:", ignoreCase = true)
        }
    }
}
154205
155- simplifyDocument(stepName= " CleanupTextNodes" ) {
206+ simplifyDocument(stepName = " CleanupTextNodes" ) {
156207 select(" *" ).forEach { element ->
157208 element.textNodes().forEach { node ->
158209 val trimmed = if (preserveWhitespace) node.text() else node.text().trim()
@@ -162,19 +213,16 @@ object HtmlSimplifier {
162213 }
163214 }
164215
165- simplifyDocument(stepName= " SimplifyNestedStructure" ) {
166- while (simplifyStructure) select(" *" )
167- .filter { element -> (element.attributes().isEmpty && element.children().size == 1 ) }
168- .filter { element ->
216+ simplifyDocument(stepName = " SimplifyNestedStructure" ) {
217+ while (simplifyStructure) select(" *" ).filter { element -> (element.attributes().isEmpty && element.children().size == 1 ) }.filter { element ->
169218 val child = element.children().first() ? : return @filter false
170219 when {
171220 ! child.attributes().isEmpty -> false
172221 child.tagName() != element.tagName() -> false
173222 child.children().size > 1 -> false
174223 else -> true
175224 }
176- }
177- .firstOrNull()?.unwrap() ? : break
225+ }.firstOrNull()?.unwrap() ? : break
178226 }
179227
180228 return document.body().html() ? : " "
0 commit comments