Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove inline scripts from scraped articles #2111

Merged
merged 2 commits into from
Dec 15, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/renderers/abstract.renderer.ts
Original file line number Diff line number Diff line change
@@ -120,69 +120,69 @@
const videoDisplayedWidth = Number(videoEl.getAttribute('width'))
let bestWidthDiff = 424242
let chosenVideoSourceEl: DominoElement
videoSourceEls.forEach((videoSourceEl: DominoElement) => {
// Ignore non-webm && non-audio sources
const videoSourceType = videoSourceEl.getAttribute('type')
if (!videoSourceEl.getAttribute('src').endsWith('.webm') && !videoSourceType.startsWith('audio')) {
DOMUtils.deleteNode(videoSourceEl)
return
}

// Handle audio content
if (videoSourceType.startsWith('audio/ogg')) {
chosenVideoSourceEl = videoSourceEl
return
} else if (videoSourceType.startsWith('audio')) {
DOMUtils.deleteNode(videoSourceEl)
return
}

// If undefined displayed width, then take the best <source> resolution
const videoSourceElWidth = Number(videoSourceEl.getAttribute('data-file-width') || videoSourceEl.getAttribute('data-width') || 0)
if (!videoDisplayedWidth) {
const chosenVideoSourceElWidth = chosenVideoSourceEl ? chosenVideoSourceEl.getAttribute('data-file-width') || chosenVideoSourceEl.getAttribute('data-width') || 0 : 0
if (videoSourceElWidth > chosenVideoSourceElWidth || (videoSourceElWidth === chosenVideoSourceElWidth && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
DOMUtils.deleteNode(chosenVideoSourceEl)
chosenVideoSourceEl = videoSourceEl
return
}
}

// Otherwise, choose <source> with better (smaller) width diff
else {
const widthDiff = Number(videoSourceElWidth - videoDisplayedWidth)

// If no source has been picked so far, just take this one
if (!chosenVideoSourceEl) {
chosenVideoSourceEl = videoSourceEl
bestWidthDiff = widthDiff
return
}

// Resolution of source is higher than displayed resolution
else if (widthDiff >= 0) {
if (bestWidthDiff < 0 || widthDiff < bestWidthDiff || (widthDiff === bestWidthDiff && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
DOMUtils.deleteNode(chosenVideoSourceEl)
chosenVideoSourceEl = videoSourceEl
bestWidthDiff = widthDiff
return
}
}

// Resolution of source is smaller than displayed resolution
else {
if (widthDiff > bestWidthDiff || (widthDiff === bestWidthDiff && videoSourceEl.getAttribute('src').endsWith('.vp9.webm'))) {
DOMUtils.deleteNode(chosenVideoSourceEl)
chosenVideoSourceEl = videoSourceEl
bestWidthDiff = widthDiff
return
}
}
}

// Delete all other nodes
DOMUtils.deleteNode(videoSourceEl)
})

Check notice on line 185 in src/renderers/abstract.renderer.ts

codefactor.io / CodeFactor

src/renderers/abstract.renderer.ts#L123-L185

Complex Method

return chosenVideoSourceEl
}
@@ -698,6 +698,14 @@
}
})

/*
* Because of CSP, some ZIM reader environments do not allow inline JS. See issues/2096.
*/
const scripts = Array.from(parsoidDoc.getElementsByTagName('script')) as DominoElement[]
audiodude marked this conversation as resolved.
Show resolved Hide resolved
for (const script of scripts) {
script.parentNode.removeChild(script)
}

/* Force display of element with that CSS class */
filtersConfig.cssClassDisplayList.map((classname: string) => {
const nodes: DominoElement[] = Array.from(parsoidDoc.getElementsByClassName(classname))
84 changes: 57 additions & 27 deletions test/unit/saveArticles.test.ts
Original file line number Diff line number Diff line change
@@ -212,6 +212,36 @@ describe('saveArticles', () => {
// Prague was correctly post-processed
expect(PragueDocument.querySelector('#POST_PROCESSOR')).toBeDefined()
})

test('Removes inline JS', async () => {
const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org' }) // en wikipedia
downloader.setUrlsDirectors(rendererInstance, rendererInstance)
const articleId = 'Potato'
const articleUrl = downloader.getArticleUrl(articleId)
const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId])
const articlesDetail = mwRetToArticleDetail(_articleDetailsRet)
const { articleDetailXId } = RedisStore
const articleDetail = { title: articleId, timestamp: '2023-08-20T14:54:01Z' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)
articleDetailXId.setMany(articlesDetail)
const result = await downloader.getArticle(
downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
rendererInstance,
articleUrl,
dump,
articleDetail,
dump.isMainPage(articleId),
)

const articleDoc = domino.createDocument(result[0].html)

// Document has scripts that we added, but shouldn't have any without a `src` (inline).
const remainingInlineScripts = Array.from(articleDoc.querySelectorAll('script:not([src])'))
expect(remainingInlineScripts.length).toBe(0)
})
}

describe('applyOtherTreatments', () => {
@@ -280,37 +310,37 @@ describe('saveArticles', () => {
expect(fewestChildren).toBeLessThanOrEqual(1)
})
*/
})

test('Test deleted article rendering (Visual editor renderer)', async () => {
const { downloader, dump } = await setupScrapeClasses() // en wikipedia
const { articleDetailXId } = RedisStore
const articleId = 'deletedArticle'
test('Test deleted article rendering (Visual editor renderer)', async () => {
const { downloader, dump } = await setupScrapeClasses() // en wikipedia
const { articleDetailXId } = RedisStore
const articleId = 'deletedArticle'

const articleJsonObject = {
visualeditor: { oldid: 0 },
}
const articleJsonObject = {
visualeditor: { oldid: 0 },
}

const articleDetail = { title: articleId, missing: '' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)

const visualEditorRenderer = new VisualEditorRenderer()

const renderOpts = {
data: articleJsonObject,
RedisStore,
webp: downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
articleDetail,
isMainPage: dump.isMainPage(articleId),
dump,
}
const articleDetail = { title: articleId, missing: '' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)

expect(async () => {
await visualEditorRenderer.render(renderOpts)
}).rejects.toThrow(new Error(DELETED_ARTICLE_ERROR))
const visualEditorRenderer = new VisualEditorRenderer()

const renderOpts = {
data: articleJsonObject,
RedisStore,
webp: downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
articleDetail,
isMainPage: dump.isMainPage(articleId),
dump,
}

expect(async () => {
await visualEditorRenderer.render(renderOpts)
}).rejects.toThrow(new Error(DELETED_ARTICLE_ERROR))
})
})

test('Load inline js from HTML', async () => {

Unchanged files with check annotations Beta

import { execa } from 'execa'

Check warning on line 1 in test/e2e/vikidia.e2e.test.ts

GitHub Actions / ci-test (18.x)

'execa' is defined but never used
import rimraf from 'rimraf'

Check warning on line 2 in test/e2e/vikidia.e2e.test.ts

GitHub Actions / ci-test (18.x)

'rimraf' is defined but never used
import { testRenders } from '../testRenders.js'

Check warning on line 3 in test/e2e/vikidia.e2e.test.ts

GitHub Actions / ci-test (18.x)

'testRenders' is defined but never used
import 'dotenv/config.js'
import { jest } from '@jest/globals'
import { zimcheck } from '../util.js'

Check warning on line 6 in test/e2e/vikidia.e2e.test.ts

GitHub Actions / ci-test (18.x)

'zimcheck' is defined but never used
jest.setTimeout(200000)
const parameters = {

Check warning on line 10 in test/e2e/vikidia.e2e.test.ts

GitHub Actions / ci-test (18.x)

'parameters' is assigned a value but never used
mwUrl: 'https://en.vikidia.org',
adminEmail: 'test@kiwix.org',
redis: process.env.REDIS,
import * as domino from 'domino'
import { WikimediaMobileRenderer } from '../../../src/renderers/wikimedia-mobile.renderer'
import exp from 'constants'

Check warning on line 4 in test/unit/renderers/mobile.renderer.test.ts

GitHub Actions / ci-test (18.x)

'exp' is defined but never used
describe('mobile renderer', () => {
describe('unhiding sections', () => {