From 15a9e0ae0d701eeca6e67567c974f7a9df8183c0 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Wed, 4 Dec 2024 17:28:19 -0800 Subject: [PATCH 1/2] Remove inline scripts from scraped articles --- src/renderers/abstract.renderer.ts | 8 +++ test/unit/saveArticles.test.ts | 84 ++++++++++++++++++++---------- 2 files changed, 65 insertions(+), 27 deletions(-) diff --git a/src/renderers/abstract.renderer.ts b/src/renderers/abstract.renderer.ts index 5dca34f8..039a7e93 100644 --- a/src/renderers/abstract.renderer.ts +++ b/src/renderers/abstract.renderer.ts @@ -698,6 +698,14 @@ export abstract class Renderer { } }) + /* + * Because of CSP, some ZIM reader environments do not allow inline JS. See issues/2096. + */ + const scripts = Array.from(parsoidDoc.getElementsByTagName('script')) as DominoElement[] + for (const script of scripts) { + script.parentNode.removeChild(script) + } + /* Force display of element with that CSS class */ filtersConfig.cssClassDisplayList.map((classname: string) => { const nodes: DominoElement[] = Array.from(parsoidDoc.getElementsByClassName(classname)) diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 48fc593e..e1682782 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -212,6 +212,36 @@ describe('saveArticles', () => { // Prague was correctly post-processed expect(PragueDocument.querySelector('#POST_PROCESSOR')).toBeDefined() }) + + test('Removes inline JS', async () => { + const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org' }) // en wikipedia + downloader.setUrlsDirectors(rendererInstance, rendererInstance) + const articleId = 'Potato' + const articleUrl = downloader.getArticleUrl(articleId) + const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId]) + const articlesDetail = mwRetToArticleDetail(_articleDetailsRet) + const { articleDetailXId } = RedisStore + const articleDetail = { title: articleId, timestamp: '2023-08-20T14:54:01Z' } + const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title) + articleDetailXId.setMany(articlesDetail) + const result = await downloader.getArticle( + downloader.webp, + _moduleDependencies, + articleId, + articleDetailXId, + rendererInstance, + articleUrl, + dump, + articleDetail, + dump.isMainPage(articleId), + ) + + const articleDoc = domino.createDocument(result[0].html) + + // Document has scripts that we added, but shouldn't have any with a `src`. + const remainingInlineScripts = Array.from(articleDoc.querySelectorAll('script:not([src])')) + expect(remainingInlineScripts.length).toBe(0) + }) } describe('applyOtherTreatments', () => { @@ -280,37 +310,37 @@ describe('saveArticles', () => { expect(fewestChildren).toBeLessThanOrEqual(1) }) */ - }) - test('Test deleted article rendering (Visual editor renderer)', async () => { - const { downloader, dump } = await setupScrapeClasses() // en wikipedia - const { articleDetailXId } = RedisStore - const articleId = 'deletedArticle' + test('Test deleted article rendering (Visual editor renderer)', async () => { + const { downloader, dump } = await setupScrapeClasses() // en wikipedia + const { articleDetailXId } = RedisStore + const articleId = 'deletedArticle' - const articleJsonObject = { - visualeditor: { oldid: 0 }, - } + const articleJsonObject = { + visualeditor: { oldid: 0 }, + } - const articleDetail = { title: articleId, missing: '' } - const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title) - - const visualEditorRenderer = new VisualEditorRenderer() - - const renderOpts = { - data: articleJsonObject, - RedisStore, - webp: downloader.webp, - _moduleDependencies, - articleId, - articleDetailXId, - articleDetail, - isMainPage: dump.isMainPage(articleId), - dump, - } + const articleDetail = { title: articleId, missing: '' } + const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title) - expect(async () => { - await visualEditorRenderer.render(renderOpts) - }).rejects.toThrow(new Error(DELETED_ARTICLE_ERROR)) + const visualEditorRenderer = new VisualEditorRenderer() + + const renderOpts = { + data: articleJsonObject, + RedisStore, + webp: downloader.webp, + _moduleDependencies, + articleId, + articleDetailXId, + articleDetail, + isMainPage: dump.isMainPage(articleId), + dump, + } + + expect(async () => { + await visualEditorRenderer.render(renderOpts) + }).rejects.toThrow(new Error(DELETED_ARTICLE_ERROR)) + }) }) test('Load inline js from HTML', async () => { From ba5e4e126b7e02cc292b4920116e401559f82792 Mon Sep 17 00:00:00 2001 From: Travis Briggs Date: Sat, 14 Dec 2024 08:25:24 -0800 Subject: [PATCH 2/2] Update comment --- test/unit/saveArticles.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index e1682782..59fbcab0 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -238,7 +238,7 @@ describe('saveArticles', () => { const articleDoc = domino.createDocument(result[0].html) - // Document has scripts that we added, but shouldn't have any with a `src`. + // Document has scripts that we added, but shouldn't have any without a `src` (inline). const remainingInlineScripts = Array.from(articleDoc.querySelectorAll('script:not([src])')) expect(remainingInlineScripts.length).toBe(0) })