Skip to content

Commit

Permalink
Remove inline scripts from scraped articles
Browse files Browse the repository at this point in the history
  • Loading branch information
audiodude authored and kelson42 committed Dec 10, 2024
1 parent ee070cf commit 7276d45
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 27 deletions.
8 changes: 8 additions & 0 deletions src/renderers/abstract.renderer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,14 @@ export abstract class Renderer {
}
})

/*
* Because of CSP, some ZIM reader environments do not allow inline JS. See issues/2096.
* Every <script> element found in the Parsoid document is removed here, whether or not it carries a `src` attribute.
* The elements are copied into an array first, so removing them does not disturb the iteration.
*/
const scripts = Array.from(parsoidDoc.getElementsByTagName('script')) as DominoElement[]
for (const script of scripts) {
script.parentNode.removeChild(script)
}

/* Force display of element with that CSS class */
filtersConfig.cssClassDisplayList.map((classname: string) => {
const nodes: DominoElement[] = Array.from(parsoidDoc.getElementsByClassName(classname))
Expand Down
84 changes: 57 additions & 27 deletions test/unit/saveArticles.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,36 @@ describe('saveArticles', () => {
// Prague was correctly post-processed
expect(PragueDocument.querySelector('#POST_PROCESSOR')).toBeDefined()
})

// Fetches the 'Potato' article through the downloader with the renderer under test,
// parses the first returned HTML payload with domino, and asserts that no <script>
// element without a `src` attribute (i.e. no inline script) is left in it.
test('Removes inline JS', async () => {
const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org' }) // en wikipedia
downloader.setUrlsDirectors(rendererInstance, rendererInstance)
const articleId = 'Potato'
const articleUrl = downloader.getArticleUrl(articleId)
const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId])
const articlesDetail = mwRetToArticleDetail(_articleDetailsRet)
const { articleDetailXId } = RedisStore
// Hand-built article detail (fixed title and timestamp) passed to getArticle below.
const articleDetail = { title: articleId, timestamp: '2023-08-20T14:54:01Z' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)
// Store the fetched article details before rendering.
// NOTE(review): setMany is not awaited here; confirm whether it returns a promise.
articleDetailXId.setMany(articlesDetail)
const result = await downloader.getArticle(
downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
rendererInstance,
articleUrl,
dump,
articleDetail,
dump.isMainPage(articleId),
)

// Only the first entry of the result is inspected.
const articleDoc = domino.createDocument(result[0].html)

// The document may still contain scripts that we added ourselves, but none of them
// may be inline, i.e. there must be no <script> element without a `src` attribute.
const remainingInlineScripts = Array.from(articleDoc.querySelectorAll('script:not([src])'))
expect(remainingInlineScripts.length).toBe(0)
})
}

describe('applyOtherTreatments', () => {
Expand Down Expand Up @@ -280,37 +310,37 @@ describe('saveArticles', () => {
expect(fewestChildren).toBeLessThanOrEqual(1)
})
*/
})

test('Test deleted article rendering (Visual editor renderer)', async () => {
const { downloader, dump } = await setupScrapeClasses() // en wikipedia
const { articleDetailXId } = RedisStore
const articleId = 'deletedArticle'
test('Test deleted article rendering (Visual editor renderer)', async () => {
const { downloader, dump } = await setupScrapeClasses() // en wikipedia
const { articleDetailXId } = RedisStore
const articleId = 'deletedArticle'

const articleJsonObject = {
visualeditor: { oldid: 0 },
}
const articleJsonObject = {
visualeditor: { oldid: 0 },
}

const articleDetail = { title: articleId, missing: '' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)

const visualEditorRenderer = new VisualEditorRenderer()

const renderOpts = {
data: articleJsonObject,
RedisStore,
webp: downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
articleDetail,
isMainPage: dump.isMainPage(articleId),
dump,
}
const articleDetail = { title: articleId, missing: '' }
const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)

expect(async () => {
await visualEditorRenderer.render(renderOpts)
}).rejects.toThrow(new Error(DELETED_ARTICLE_ERROR))
const visualEditorRenderer = new VisualEditorRenderer()

const renderOpts = {
data: articleJsonObject,
RedisStore,
webp: downloader.webp,
_moduleDependencies,
articleId,
articleDetailXId,
articleDetail,
isMainPage: dump.isMainPage(articleId),
dump,
}

expect(async () => {
await visualEditorRenderer.render(renderOpts)
}).rejects.toThrow(new Error(DELETED_ARTICLE_ERROR))
})
})

test('Load inline js from HTML', async () => {
Expand Down

0 comments on commit 7276d45

Please sign in to comment.