From bbf872eb3a1b277f89924eb2116f2e468ca2b3fc Mon Sep 17 00:00:00 2001 From: Patrick Hereford Date: Wed, 27 Mar 2019 10:48:49 -0400 Subject: [PATCH] Remove HTML style tags in preprocessing --- src/preprocessing.jl | 2 ++ test/preprocessing.jl | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/preprocessing.jl b/src/preprocessing.jl index 6606cdfe..651e792e 100644 --- a/src/preprocessing.jl +++ b/src/preprocessing.jl @@ -124,10 +124,12 @@ end # ############################################################################## const script_tags = Regex("<script\\b[^>]*>([\\s\\S]*?)</script>") +const style_tags = Regex("<style\\b[^>]*>([\\s\\S]*?)</style>") const html_tags = Regex("<[^>]*>") function remove_html_tags(s::AbstractString) s = remove_patterns(s, script_tags) + s = remove_patterns(s, style_tags) remove_patterns(s, html_tags) end diff --git a/test/preprocessing.jl b/test/preprocessing.jl index 69f977cc..6d2d598d 100644 --- a/test/preprocessing.jl +++ b/test/preprocessing.jl @@ -82,6 +82,26 @@ remove_html_tags!(d) @test "Hello world" == strip(d.text) + style_html_doc = StringDocument( + """ + + + + + + +

Hello

world + + + """ + ) + remove_html_tags!(style_html_doc) + @test "Hello world" == strip(style_html_doc.text) + #Test #62 remove_corrupt_utf8("abc") == "abc" remove_corrupt_utf8(String([0x43, 0xf0])) == "C "