Skip to content

Commit

Permalink
Merge pull request #137 from phereford/master
Browse files Browse the repository at this point in the history
Remove HTML style tags in preprocessing
  • Loading branch information
aviks committed Apr 1, 2019
2 parents 53799b2 + bbf872e commit c8ae7a2
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,12 @@ end
#
##############################################################################
# Patterns used by `remove_html_tags`.
#
# HTML element names are case-insensitive, so the script/style patterns carry
# the `i` flag: without it `<SCRIPT>` / `<STYLE>` blocks would only lose their
# tags (through `html_tags`) and leave their JavaScript/CSS bodies in the text.
# The closing tag also tolerates whitespace before `>` (e.g. `</script >`).
# `[\s\S]*?` is a lazy "any character including newline" so each block stops at
# its own closing tag instead of swallowing everything up to the last one.
const script_tags = r"<script\b[^>]*>([\s\S]*?)</script\s*>"i
const style_tags = r"<style\b[^>]*>([\s\S]*?)</style\s*>"i
const html_tags = r"<[^>]*>"

"""
    remove_html_tags(s::AbstractString)

Strip HTML markup from `s`.

`s` is passed through `remove_patterns` three times, in this order:

1. `script_tags` - whole `<script>...</script>` blocks, including their content;
2. `style_tags`  - whole `<style>...</style>` blocks, including their content;
3. `html_tags`   - every remaining `<...>` tag.

The order matters: the generic tag pattern would otherwise delete the
`<script>`/`<style>` delimiters first and leave their bodies behind.

Returns the result of the final `remove_patterns` call.
"""
function remove_html_tags(s::AbstractString)
    s = remove_patterns(s, script_tags)
    s = remove_patterns(s, style_tags)
    return remove_patterns(s, html_tags)
end

Expand Down
20 changes: 20 additions & 0 deletions test/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,26 @@
remove_html_tags!(d)
@test "Hello world" == strip(d.text)

# A document containing both a <script> and a <style> block: after tag removal
# only the visible text ("Hello world") may remain, i.e. neither the JavaScript
# nor the CSS body is allowed to leak into the document text.
style_html_doc = StringDocument("""
<html>
<head>
<script language=\"javascript\"> x = 20; </script>
</head>
<body>
<style>
.fake-style {
color: #00ff00;
}
</style>
<h1>Hello</h1><a href=\"world\">world</a>
</body>
</html>
""")
remove_html_tags!(style_html_doc)
@test strip(style_html_doc.text) == "Hello world"

#Test #62
# Bare `==` expressions are evaluated and thrown away, so they can never fail
# the suite; `@test` turns them into real assertions.
@test remove_corrupt_utf8("abc") == "abc"
@test remove_corrupt_utf8(String([0x43, 0xf0])) == "C "
Expand Down

0 comments on commit c8ae7a2

Please sign in to comment.