Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove inline scripts from scraped articles #2111

Merged
merged 2 commits into from
Dec 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/renderers/abstract.renderer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,69 +120,69 @@
const videoDisplayedWidth = Number(videoEl.getAttribute('width'))
let bestWidthDiff = 424242
let chosenVideoSourceEl: DominoElement
videoSourceEls.forEach((videoSourceEl: DominoElement) => {
// Ignore non-webm && non-audio sources
const videoSourceType = videoSourceEl.getAttribute('type')
if (!videoSourceEl.getAttribute('src').endsWith('.webm') && !videoSourceType.startsWith('audio')) {
DOMUtils.deleteNode(videoSourceEl)
return
}

// Handle audio content
if (videoSourceType.startsWith('audio/ogg')) {
chosenVideoSourceEl = videoSourceEl
return
} else if (videoSourceType.startsWith('audio')) {
DOMUtils.deleteNode(videoSourceEl)
return
}

// If undefined displayed width, then take the best <source> resolution
const videoSourceElWidth = Number(videoSourceEl.getAttribute('data-file-width') || videoSourceEl.getAttribute('data-width') || 0)
if (!videoDisplayedWidth) {
const chosenVideoSourceElWidth = chosenVideoSourceEl ? chosenVideoSourceEl.getAttribute('data-file-width') || chosenVideoSourceEl.getAttribute('data-width') || 0 : 0
if (videoSourceElWidth > chosenVideoSourceElWidth || (videoSourceElWidth === chosenVideoSourceElWidth && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
DOMUtils.deleteNode(chosenVideoSourceEl)
chosenVideoSourceEl = videoSourceEl
return
}
}

// Otherwise, choose <source> with better (smaller) width diff
else {
const widthDiff = Number(videoSourceElWidth - videoDisplayedWidth)

// If no source has been picked so far, just take this one
if (!chosenVideoSourceEl) {
chosenVideoSourceEl = videoSourceEl
bestWidthDiff = widthDiff
return
}

// Resolution of source is higher than displayed resolution
else if (widthDiff >= 0) {
if (bestWidthDiff < 0 || widthDiff < bestWidthDiff || (widthDiff === bestWidthDiff && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
DOMUtils.deleteNode(chosenVideoSourceEl)
chosenVideoSourceEl = videoSourceEl
bestWidthDiff = widthDiff
return
}
}

// Resolution of source is smaller than displayed resolution
else {
if (widthDiff > bestWidthDiff || (widthDiff === bestWidthDiff && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
DOMUtils.deleteNode(chosenVideoSourceEl)
chosenVideoSourceEl = videoSourceEl
bestWidthDiff = widthDiff
return
}
}
}

// Delete all other nodes
DOMUtils.deleteNode(videoSourceEl)
})

Check notice on line 185 in src/renderers/abstract.renderer.ts

View check run for this annotation

codefactor.io / CodeFactor

src/renderers/abstract.renderer.ts#L123-L185

Complex Method

return chosenVideoSourceEl
}
Expand Down Expand Up @@ -698,6 +698,14 @@
}
})

/*
* Because of CSP, some ZIM reader environments do not allow inline JS. See issues/2096.
*/
const scripts = Array.from(parsoidDoc.getElementsByTagName('script')) as DominoElement[]
audiodude marked this conversation as resolved.
Show resolved Hide resolved
for (const script of scripts) {
script.parentNode.removeChild(script)
}

/* Force display of element with that CSS class */
filtersConfig.cssClassDisplayList.map((classname: string) => {
const nodes: DominoElement[] = Array.from(parsoidDoc.getElementsByClassName(classname))
Expand Down
84 changes: 57 additions & 27 deletions test/unit/saveArticles.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,36 @@ describe('saveArticles', () => {
// Prague was correctly post-processed
expect(PragueDocument.querySelector('#POST_PROCESSOR')).toBeDefined()
})

test('Removes inline JS', async () => {
const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org' }) // en wikipedia
downloader.setUrlsDirectors(rendererInstance, rendererInstance)
const articleId = 'Potato'
const articleUrl = downloader.getArticleUrl(articleId)
const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId])
const articlesDetail = mwRetToArticleDetail(_articleDetailsRet)
const { articleDetailXId } = RedisStore
const articleDetail = { title: articleId, timestamp: '2023-08-20T14:54:01Z' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)
articleDetailXId.setMany(articlesDetail)
const result = await downloader.getArticle(
downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
rendererInstance,
articleUrl,
dump,
articleDetail,
dump.isMainPage(articleId),
)

const articleDoc = domino.createDocument(result[0].html)

// Document has scripts that we added, but shouldn't have any without a `src` (inline).
const remainingInlineScripts = Array.from(articleDoc.querySelectorAll('script:not([src])'))
expect(remainingInlineScripts.length).toBe(0)
})
}

describe('applyOtherTreatments', () => {
Expand Down Expand Up @@ -280,37 +310,37 @@ describe('saveArticles', () => {
expect(fewestChildren).toBeLessThanOrEqual(1)
})
*/
})

test('Test deleted article rendering (Visual editor renderer)', async () => {
const { downloader, dump } = await setupScrapeClasses() // en wikipedia
const { articleDetailXId } = RedisStore
const articleId = 'deletedArticle'
test('Test deleted article rendering (Visual editor renderer)', async () => {
const { downloader, dump } = await setupScrapeClasses() // en wikipedia
const { articleDetailXId } = RedisStore
const articleId = 'deletedArticle'

const articleJsonObject = {
visualeditor: { oldid: 0 },
}
const articleJsonObject = {
visualeditor: { oldid: 0 },
}

const articleDetail = { title: articleId, missing: '' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)

const visualEditorRenderer = new VisualEditorRenderer()

const renderOpts = {
data: articleJsonObject,
RedisStore,
webp: downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
articleDetail,
isMainPage: dump.isMainPage(articleId),
dump,
}
const articleDetail = { title: articleId, missing: '' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)

expect(async () => {
await visualEditorRenderer.render(renderOpts)
}).rejects.toThrow(new Error(DELETED_ARTICLE_ERROR))
const visualEditorRenderer = new VisualEditorRenderer()

const renderOpts = {
data: articleJsonObject,
RedisStore,
webp: downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
articleDetail,
isMainPage: dump.isMainPage(articleId),
dump,
}

expect(async () => {
await visualEditorRenderer.render(renderOpts)
}).rejects.toThrow(new Error(DELETED_ARTICLE_ERROR))
})
})

test('Load inline js from HTML', async () => {
Expand Down
Loading