Skip to content

Commit bb3a2da

Browse files
committed
Update HtmlSimplifier.kt
1 parent 753a536 commit bb3a2da

File tree

1 file changed

+107
-59
lines changed
  • webui/src/main/kotlin/com/simiacryptus/skyenet/apps/plan/tools/online

1 file changed

+107
-59
lines changed

webui/src/main/kotlin/com/simiacryptus/skyenet/apps/plan/tools/online/HtmlSimplifier.kt

Lines changed: 107 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,45 @@ import org.jsoup.nodes.Document
55
import org.jsoup.nodes.Entities
66

77
object HtmlSimplifier {
8-
val log = org.slf4j.LoggerFactory.getLogger(HtmlSimplifier::class.java)
8+
private val log = org.slf4j.LoggerFactory.getLogger(HtmlSimplifier::class.java)
9+
10+
/** Elements that can execute scripts or load external content */
11+
private val SCRIPT_ELEMENTS = setOf(
12+
"script", "noscript", "iframe"
13+
)
14+
/** Elements that handle user input */
15+
private val INTERACTIVE_ELEMENTS = setOf(
16+
"form", "input", "textarea", "button", "select", "option"
17+
)
18+
/** Elements that load or display media content */
19+
private val MEDIA_ELEMENTS = setOf(
20+
"canvas", "audio", "video", "source", "track", "picture"
21+
)
922
private val PRESERVED_ELEMENTS = setOf(
10-
"p", "div", "span", "table", "tr", "td", "th",
11-
"thead", "tbody", "tfoot", "ul", "ol", "li", "h1", "h2", "h3", "h4", "h5", "h6",
12-
"br", "hr", "img"
23+
"p", "div", "span", "table", "tr", "td", "th", "thead", "tbody", "tfoot", "ul", "ol", "li", "h1", "h2", "h3", "h4", "h5", "h6", "br", "hr", "img"
1324
)
1425
private val DEFAULT_IMPORTANT_ATTRIBUTES = setOf(
15-
"href", "src", "alt", "title", "style",
16-
"class", "name", "rel", "type", "content",
17-
"colspan", "rowspan", "scope", "id", "lang",
18-
"aria-label", "aria-describedby", "role"
26+
"href",
27+
"src",
28+
"alt",
29+
"title",
30+
"style",
31+
"class",
32+
"name",
33+
"rel",
34+
"type",
35+
"content",
36+
"colspan",
37+
"rowspan",
38+
"scope",
39+
"id",
40+
"lang",
41+
"aria-label",
42+
"aria-describedby",
43+
"role"
44+
)
45+
private val SCRIPT_ATTRIBUTES = setOf(
46+
"onclick", "onload", "onsubmit", "oninput", "onchange"
1947
)
2048

2149
fun scrubHtml(
@@ -24,16 +52,27 @@ object HtmlSimplifier {
2452
includeCssData: Boolean = false,
2553
simplifyStructure: Boolean = true,
2654
keepObjectIds: Boolean = false,
27-
preserveWhitespace: Boolean = false
55+
preserveWhitespace: Boolean = false,
56+
keepScriptElements: Boolean = false,
57+
keepInteractiveElements: Boolean = false,
58+
keepMediaElements: Boolean = false,
59+
keepEventHandlers: Boolean = false
2860
): String {
61+
// Reject a baseUrl that starts with the javascript: or data: scheme
62+
baseUrl?.let {
63+
require(!it.startsWith("javascript:") && !it.startsWith("data:")) { "Invalid base URL scheme" }
64+
}
2965
require(str.isNotBlank()) { "Input HTML cannot be blank" }
66+
require(!str.startsWith("data:")) { "Data URLs are not supported" }
67+
require(!str.startsWith("javascript:")) { "JavaScript URLs are not supported" }
68+
3069
val document: Document = try {
3170
if (null != baseUrl) Jsoup.parse(str, baseUrl) else Jsoup.parse(str)
3271
} catch (e: Exception) {
3372
throw IllegalArgumentException("Failed to parse HTML: ${e.message}", e)
3473
}
3574

36-
fun simplifyDocument(stepName:String = "", fn : Document.() -> Unit) = try {
75+
fun simplifyDocument(stepName: String = "", fn: Document.() -> Unit) = try {
3776
val prevDocSize = document.html().length
3877
val startTime = System.currentTimeMillis()
3978
document.fn()
@@ -44,76 +83,82 @@ object HtmlSimplifier {
4483
log.warn("Failed to simplify HTML in ${stepName}: ${e.message}", e)
4584
}
4685

47-
simplifyDocument(stepName="Setup") {
86+
simplifyDocument(stepName = "Setup") {
4887
outputSettings().prettyPrint(true)
4988
outputSettings().charset("UTF-8")
5089
outputSettings().escapeMode(Entities.EscapeMode.xhtml)
5190
outputSettings().syntax(Document.OutputSettings.Syntax.html)
5291
}
5392

54-
simplifyDocument(stepName="RemoveUnsafeElements") {
93+
simplifyDocument(stepName = "RemoveUnsafeElements") {
94+
val elementsToRemove = mutableListOf<String>()
95+
elementsToRemove.addAll(
96+
listOf(
97+
"link", "meta", "object", "embed", "applet", "base", "frame", "frameset", "marquee", "blink"
98+
)
99+
)
100+
if (!keepScriptElements) elementsToRemove.addAll(SCRIPT_ELEMENTS)
101+
if (!keepInteractiveElements) elementsToRemove.addAll(INTERACTIVE_ELEMENTS)
102+
if (!keepMediaElements) elementsToRemove.addAll(MEDIA_ELEMENTS)
103+
if (!includeCssData) elementsToRemove.add("style")
55104
select(
56-
"""
57-
script, link, meta, iframe, noscript,
58-
object, embed, form, input, textarea,
59-
button, svg,${if (includeCssData) " style," else ""}
60-
canvas, audio, video, source, applet, base,
61-
frame, frameset, marquee, blink
62-
""".trimIndent().replace("\n", "")
105+
elementsToRemove.joinToString(", ")
63106
).remove()
64107
}
65108

66-
simplifyDocument(stepName="RemoveDataAttributes") {
109+
simplifyDocument(stepName = "RemoveDataAttributes") {
67110
select("[data-*]").forEach { it.attributes().removeAll { attr -> attr.key.startsWith("data-") } }
68111
}
69112

70-
simplifyDocument(stepName="RemoveEventHandlers") {
71-
select("*").forEach { element ->
72-
element.attributes().removeAll { attr -> attr.key.lowercase().startsWith("on") }
113+
simplifyDocument(stepName = "RemoveEventHandlers") {
114+
if (!keepEventHandlers) {
115+
select("*").forEach { element ->
116+
element.attributes().removeAll { attr ->
117+
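// NOTE(review): strips on* attributes EXCEPT those listed in SCRIPT_ATTRIBUTES (onclick, onload, ...), which survive this step; this looks inverted, confirm intent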
118+
attr.key.lowercase().startsWith("on") && attr.key !in SCRIPT_ATTRIBUTES
119+
}
120+
}
73121
}
74122
}
75123

76-
simplifyDocument(stepName="RemoveUnsafeAttributes") {
124+
simplifyDocument(stepName = "RemoveUnsafeAttributes") {
77125
select("*").forEach { element ->
78126
element.attributes().forEach { attr ->
79-
if (attr.value.contains("javascript:") ||
80-
attr.value.contains("data:") ||
81-
attr.value.contains("vbscript:") ||
82-
attr.value.contains("file:")
127+
if (!keepScriptElements && (attr.value.contains("javascript:") || attr.value.contains("data:") || attr.value.contains("vbscript:") || attr.value.contains(
128+
"file:"
129+
))
83130
) {
84131
element.removeAttr(attr.key)
85132
}
86133
}
87134
}
88135
}
89136

90-
simplifyDocument(stepName="FilterAttributes") {
91-
val importantAttributes = DEFAULT_IMPORTANT_ATTRIBUTES
92-
.let { baseSet ->
137+
simplifyDocument(stepName = "FilterAttributes") {
138+
val importantAttributes = DEFAULT_IMPORTANT_ATTRIBUTES.let { baseSet ->
93139
when {
94140
includeCssData -> baseSet
95141
keepObjectIds -> baseSet - setOf("style", "class", "width", "height", "target")
96142
else -> baseSet - setOf("style", "class", "id", "width", "height", "target")
97143
}
98-
}
99-
.toSet()
144+
}.toSet()
100145
select("*").forEach { element ->
101146
element.attributes().removeAll { attr -> attr.key !in importantAttributes }
102147
}
103148
}
104149

105-
simplifyDocument(stepName="RemoveEmptyElements") {
150+
simplifyDocument(stepName = "RemoveEmptyElements") {
106151
select("*:not(img)").forEach { element ->
107-
if (element.text().isBlank() &&
108-
element.attributes().isEmpty &&
109-
!element.select("img, br, hr, iframe[src]").any() // Preserve non-empty iframes
152+
if (element.text().isBlank() &&
153+
element.attributes().isEmpty &&
154+
!element.select("img, br, hr, iframe[src], svg, source[src], track[src]").any() // Added more media elements
110155
) {
111156
element.remove()
112157
}
113158
}
114159
}
115160

116-
simplifyDocument(stepName="CleanupHrefAttributes") {
161+
simplifyDocument(stepName = "CleanupHrefAttributes") {
117162
select("a[href]").forEach { element ->
118163
val href = element.attr("href")
119164
if (href.startsWith("javascript:") || href.startsWith("data:")) {
@@ -122,37 +167,43 @@ object HtmlSimplifier {
122167
}
123168
}
124169

125-
simplifyDocument(stepName="UnwrapSimpleTextElements") {
170+
simplifyDocument(stepName = "UnwrapSimpleTextElements") {
126171
select("*").forEach { element ->
127-
if (element.tagName() !in PRESERVED_ELEMENTS &&
128-
element.childNodes().size == 1 &&
129-
element.childNodes().first()?.nodeName() == "#text" &&
130-
element.attributes().isEmpty()
172+
if (element.tagName() !in PRESERVED_ELEMENTS && element.childNodes().size == 1
173+
&& element.childNodes().first()?.nodeName() == "#text" && element.attributes().isEmpty()
131174
) {
132175
element.unwrap()
133176
}
134177
}
135178
}
136179

137-
simplifyDocument(stepName="ConvertRelativeUrls") {
138-
select("a[href]").forEach {
139-
it.attr("href", it.absUrl("href"))
140-
}
141-
select("img[src]").forEach {
142-
it.attr("src", it.absUrl("src"))
180+
simplifyDocument(stepName = "ConvertRelativeUrls") {
181+
if (baseUrl != null) {
182+
// Rewrite href/src on links, images, and media source/track elements to absolute URLs
183+
select("a[href]").forEach {
184+
it.attr("href", it.absUrl("href"))
185+
}
186+
select("img[src]").forEach {
187+
it.attr("src", it.absUrl("src"))
188+
}
189+
select("source[src]").forEach {
190+
it.attr("src", it.absUrl("src"))
191+
}
192+
select("track[src]").forEach {
193+
it.attr("src", it.absUrl("src"))
194+
}
143195
}
144196
}
145197

146-
simplifyDocument(stepName="RemoveInvalidAttributes") {
198+
simplifyDocument(stepName = "RemoveInvalidAttributes") {
147199
select("*").forEach { element ->
148200
element.attributes().removeAll { attr ->
149-
attr.value.isBlank() || attr.value == "null" ||
150-
attr.value.contains("javascript:") || attr.value.contains("data:")
201+
attr.value.isBlank() || attr.value == "null" || attr.value.contains("javascript:") || attr.value.contains("data:")
151202
}
152203
}
153204
}
154205

155-
simplifyDocument(stepName="CleanupTextNodes") {
206+
simplifyDocument(stepName = "CleanupTextNodes") {
156207
select("*").forEach { element ->
157208
element.textNodes().forEach { node ->
158209
val trimmed = if (preserveWhitespace) node.text() else node.text().trim()
@@ -162,19 +213,16 @@ object HtmlSimplifier {
162213
}
163214
}
164215

165-
simplifyDocument(stepName="SimplifyNestedStructure") {
166-
while (simplifyStructure) select("*")
167-
.filter { element -> (element.attributes().isEmpty && element.children().size == 1) }
168-
.filter { element ->
216+
simplifyDocument(stepName = "SimplifyNestedStructure") {
217+
while (simplifyStructure) select("*").filter { element -> (element.attributes().isEmpty && element.children().size == 1) }.filter { element ->
169218
val child = element.children().first() ?: return@filter false
170219
when {
171220
!child.attributes().isEmpty -> false
172221
child.tagName() != element.tagName() -> false
173222
child.children().size > 1 -> false
174223
else -> true
175224
}
176-
}
177-
.firstOrNull()?.unwrap() ?: break
225+
}.firstOrNull()?.unwrap() ?: break
178226
}
179227

180228
return document.body().html() ?: ""

0 commit comments

Comments
 (0)