Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: correctly escape text that would otherwise be interpreted as raw HTML and HTML blocks. #438

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion src/turndown.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,45 @@ import { extend, trimLeadingNewlines, trimTrailingNewlines } from './utilities'
import RootNode from './root-node'
import Node from './node'
var reduce = Array.prototype.reduce

// Regex *source strings* describing the inline "raw HTML" constructs of
// CommonMark. Taken from `commonmark.js/lib/common.js`. They are kept as plain
// strings (not RegExp objects) so they can be concatenated into `HTMLTAG`.
var TAGNAME = '[A-Za-z][A-Za-z0-9-]*'
var ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'
var UNQUOTEDVALUE = "[^\"'=<>`\\x00-\\x20]+"
var SINGLEQUOTEDVALUE = "'[^']*'"
var DOUBLEQUOTEDVALUE = '"[^"]*"'
var ATTRIBUTEVALUE =
  '(?:' +
  UNQUOTEDVALUE +
  '|' +
  SINGLEQUOTEDVALUE +
  '|' +
  DOUBLEQUOTEDVALUE +
  ')'
var ATTRIBUTEVALUESPEC = '(?:' + '\\s*=' + '\\s*' + ATTRIBUTEVALUE + ')'
var ATTRIBUTE = '(?:' + '\\s+' + ATTRIBUTENAME + ATTRIBUTEVALUESPEC + '?)'
var OPENTAG = '<' + TAGNAME + ATTRIBUTE + '*' + '\\s*/?>'
var CLOSETAG = '</' + TAGNAME + '\\s*[>]'
// Deliberate deviation from a `[^-]+` alternative inside the repeated group:
// a `+` nested under `*` lets the engine split a run of non-dash characters in
// exponentially many ways, so an unterminated `<!--aaaa...` would hang the
// conversion. Consuming one character per iteration matches exactly the same
// strings (the three alternatives are mutually exclusive at any position) but
// backtracks in linear time.
var HTMLCOMMENT = '<!-->|<!--->|<!--(?:[^-]|-[^-]|--[^>])*-->'
var PROCESSINGINSTRUCTION = '[<][?][\\s\\S]*?[?][>]'
var DECLARATION = '<![A-Z]+' + '[^>]*>'
var CDATA = '<!\\[CDATA\\[[\\s\\S]*?\\]\\]>'
// Source for a regex matching any single raw-HTML construct. The outer
// non-capturing group also scopes the top-level `|` inside `HTMLCOMMENT`.
var HTMLTAG =
  '(?:' +
  OPENTAG +
  '|' +
  CLOSETAG +
  '|' +
  // Note: Turndown removes comments, so this portion of the regex isn't
  // necessary, but doesn't cause problems.
  HTMLCOMMENT +
  '|' +
  PROCESSINGINSTRUCTION +
  '|' +
  DECLARATION +
  '|' +
  CDATA +
  ')'
// End of copied commonmark code.
var escapes = [
[/\\/g, '\\\\'],
[/\*/g, '\\*'],
Expand All @@ -17,7 +56,28 @@ var escapes = [
[/\]/g, '\\]'],
[/^>/g, '\\>'],
[/_/g, '\\_'],
[/^(\d+)\. /g, '$1\\. ']
[/^(\d+)\. /g, '$1\\. '],
// Per [section 6.6 of the CommonMark spec](https://spec.commonmark.org/0.30/#raw-html),
// Raw HTML, CommonMark recognizes and passes through HTML-like tags and
// their contents. Therefore, Turndown needs to escape text that would parse
// as an HTML-like tag. This regex recognizes these tags and escapes them by
// inserting a leading backslash.
[new RegExp(HTMLTAG, 'g'), '\\$&'],
// Likewise, [section 4.6 of the CommonMark spec](https://spec.commonmark.org/0.30/#html-blocks),
// HTML blocks, requires the same treatment.
//
// This regex was copied from `commonmark.js/lib/blocks.js`, the
// `reHtmlBlockOpen` variable. We only need regexps for patterns not matched
// by the previous pattern, so this doesn't need all expressions there.
//
// TODO: this is too aggressive; it should only recognize this pattern at
// the beginning of a line of CommonMark source; these will recognize the
// pattern at the beginning of any inline or block markup. The approach I
// tried was to put this in `commonmark-rules.js` for the `paragraph` and
// `heading` rules (the only block beginning-of-line rules). However, text
// outside a paragraph/heading doesn't get escaped in this case.
[/^<(?:script|pre|textarea|style)(?:\s|>|$)/i, '\\$&'],
[/^<[/]?(?:address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)(?:\s|[/]?[>]|$)/i, '\\$&']
]

export default function TurndownService (options) {
Expand Down
31 changes: 31 additions & 0 deletions test/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,37 @@ <h2>This is a header.</h2>
<pre class="expected">` nasty code `</pre>
</div>

<div class="case" data-name="Correct escaping of inline raw HTML">
<div class="input">Test &lt;code&gt;tags&lt;/code&gt;, &lt;!-- comments --&gt;, &lt;?processing instructions?&gt;, &lt;!A declaration&gt;, and &lt;![CDATA[character data]]&gt;.</div>
<pre class="expected">Test \&lt;code&gt;tags\&lt;/code&gt;, \&lt;!-- comments --&gt;, \&lt;?processing instructions?&gt;, \&lt;!A declaration&gt;, and &lt;!\[CDATA\[character data\]\]&gt;.</pre>
</div>

<div class="case" data-name="Correct escaping of multi-line raw inline HTML">
<div class="input">Test &lt;code&gt;multi-line
tags&lt;/code&gt;, &lt;!-- multi-line
comments --&gt;, &lt;?multi-line
processing instructions?&gt;, &lt;!A multi-line
declaration&gt;, and &lt;![CDATA[multi-line
character data]]&gt;.</div>
<pre class="expected">Test \&lt;code&gt;multi-line tags\&lt;/code&gt;, \&lt;!-- multi-line comments --&gt;, \&lt;?multi-line processing instructions?&gt;, \&lt;!A multi-line declaration&gt;, and &lt;!\[CDATA\[multi-line character data\]\]&gt;.</pre>
</div>

<div class="case" data-name="Correct escaping of HTML blocks">
<div class="input"><p>&lt;pre</p> <p>&lt;script</p> <p>&lt;style</p> <p>&lt;textarea</p> <p>&lt;address</p> <p>&lt;ul</p></div>
<pre class="expected">\&lt;pre

\&lt;script

\&lt;style

\&lt;textarea

\&lt;address

\&lt;ul</pre>
</div>


<!-- /TEST CASES -->

<script src="turndown-test.browser.js"></script>
Expand Down