diff --git a/src/Downloader.ts b/src/Downloader.ts index 7b66d420..cbb83da9 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -181,12 +181,14 @@ class Downloader { public async setBaseUrls() { //* Objects order in array matters! this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([ + { condition: await MediaWiki.hasMediawikiParsoidApi(), value: MediaWiki.useParsoidApiUrl.href }, { condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href }, { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href }, ]) //* Objects order in array matters! this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([ + { condition: await MediaWiki.hasMediawikiParsoidApi(), value: MediaWiki.useParsoidApiUrl.href }, { condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href }, { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href }, ]) diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts index d84a10dc..e587058d 100644 --- a/src/MediaWiki.ts +++ b/src/MediaWiki.ts @@ -11,6 +11,7 @@ import BaseURLDirector from './util/builders/url/base.director.js' import ApiURLDirector from './util/builders/url/api.director.js' import DesktopURLDirector from './util/builders/url/desktop.director.js' import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js' +import UseParsoidURLDirector from './util/builders/url/use-parsoid.director.js' import { checkApiAvailability } from './util/mw-api.js' class MediaWiki { @@ -40,8 +41,10 @@ class MediaWiki { private apiUrlDirector: ApiURLDirector private wikimediaDesktopUrlDirector: DesktopURLDirector private visualEditorURLDirector: VisualEditorURLDirector + private useParsoidURLDirector: UseParsoidURLDirector public visualEditorApiUrl: URL + public useParsoidApiUrl: URL public apiUrl: URL public modulePath: string // only for reading public _modulePathOpt: string // only for 
whiting to generate modulePath @@ -50,6 +53,7 @@ class MediaWiki { #hasWikimediaDesktopRestApi: boolean | null #hasVisualEditorApi: boolean | null + #hasMediawikiParsoidApi: boolean | null set username(value: string) { this.#username = value @@ -99,6 +103,7 @@ class MediaWiki { this.#hasWikimediaDesktopRestApi = null this.#hasVisualEditorApi = null + this.#hasMediawikiParsoidApi = null } private constructor() { @@ -121,16 +126,26 @@ class MediaWiki { return this.#hasVisualEditorApi } + public async hasMediawikiParsoidApi(): Promise<boolean> { + if (this.#hasMediawikiParsoidApi === null) { + this.#hasMediawikiParsoidApi = await checkApiAvailability(this.useParsoidURLDirector.buildArticleURL(this.apiCheckArticleId)) + return this.#hasMediawikiParsoidApi + } + return this.#hasMediawikiParsoidApi + } + private initMWApis() { const baseUrlDirector = new BaseURLDirector(this.baseUrl.href) this.webUrl = baseUrlDirector.buildURL(this.#wikiPath) this.apiUrl = baseUrlDirector.buildURL(this.#apiPath) this.apiUrlDirector = new ApiURLDirector(this.apiUrl.href) this.visualEditorApiUrl = this.apiUrlDirector.buildVisualEditorURL() + this.useParsoidApiUrl = this.apiUrlDirector.buildUseParsoidURL() this.desktopRestApiUrl = baseUrlDirector.buildDesktopRestApiURL(this.#restApiPath) this.modulePath = baseUrlDirector.buildModuleURL(this._modulePathOpt) this.wikimediaDesktopUrlDirector = new DesktopURLDirector(this.desktopRestApiUrl.href) this.visualEditorURLDirector = new VisualEditorURLDirector(this.visualEditorApiUrl.href) + this.useParsoidURLDirector = new UseParsoidURLDirector(this.useParsoidApiUrl.href) } public async login(downloader: Downloader) { diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index 57467def..10ddad57 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -209,6 +209,7 @@ async function execute(argv: any) { } MediaWiki.apiCheckArticleId = mwMetaData.mainPage + await MediaWiki.hasMediawikiParsoidApi() await MediaWiki.hasWikimediaDesktopRestApi() 
await MediaWiki.hasVisualEditorApi() diff --git a/src/util/builders/url/api.director.ts b/src/util/builders/url/api.director.ts index 477ee286..f2092a06 100644 --- a/src/util/builders/url/api.director.ts +++ b/src/util/builders/url/api.director.ts @@ -48,6 +48,10 @@ export default class ApiURLDirector { return urlBuilder.setDomain(this.baseDomain).setQueryParams({ action: 'visualeditor', mobileformat: 'html', format: 'json', paction: 'parse', page: '' }).build(true) } + buildUseParsoidURL() { + return urlBuilder.setDomain(this.baseDomain).setQueryParams({ action: 'parse', format: 'json', prop: 'text|revid|modules|jsconfigvars|headhtml', parsoid: '1', page: '' }).build(true) // revid is required by MediawikiParsoidRenderer's deleted-article check (parse.revid === 0) + } + buildArticleApiURL(articleId: string) { const domain = this.buildBaseArticleURL() diff --git a/src/util/builders/url/use-parsoid.director.ts b/src/util/builders/url/use-parsoid.director.ts new file mode 100644 index 00000000..8f74de47 --- /dev/null +++ b/src/util/builders/url/use-parsoid.director.ts @@ -0,0 +1,19 @@ +import urlBuilder from './url.builder.js' + +/** + * Interface to build article URLs on top of the MediaWiki action=parse API URL with Parsoid enabled (parsoid=1) + */ +export default class UseParsoidURLDirector { + baseDomain: string + + constructor(baseDomain: string) { + this.baseDomain = baseDomain + } + + buildArticleURL(articleId: string) { + return urlBuilder + .setDomain(this.baseDomain) + .setQueryParams({ page: encodeURIComponent(articleId) }, '&') + .build() + } +} diff --git a/src/util/mw-api.ts b/src/util/mw-api.ts index 04386fe9..a036bf5b 100644 --- a/src/util/mw-api.ts +++ b/src/util/mw-api.ts @@ -259,7 +259,7 @@ export function mwRetToArticleDetail(obj: QueryMwRet): KVS { export async function checkApiAvailability(url: string, loginCookie = ''): Promise { try { const resp = await axios.get(url, { maxRedirects: 0, headers: { cookie: loginCookie } }) - return resp.status === 200 && !resp.headers['mediawiki-api-error'] + return resp.status === 200 && !resp.headers['mediawiki-api-error'] && 
!(resp.data?.warnings?.main?.['*'] === 'Unrecognized parameter: parsoid.') } catch (err) { return false } diff --git a/src/util/renderers/abstract.renderer.ts b/src/util/renderers/abstract.renderer.ts index a0ed8804..8618b382 100644 --- a/src/util/renderers/abstract.renderer.ts +++ b/src/util/renderers/abstract.renderer.ts @@ -1,5 +1,5 @@ type renderType = 'auto' | 'desktop' | 'mobile' | 'specific' -type renderName = 'VisualEditor' | 'WikimediaDesktop' | 'WikimediaMobile' +type renderName = 'VisualEditor' | 'WikimediaDesktop' | 'WikimediaMobile' | 'MediawikiParsoid' interface RendererBuilderOptionsBase { renderType: renderType diff --git a/src/util/renderers/mediawiki-parsoid-renderer.ts b/src/util/renderers/mediawiki-parsoid-renderer.ts new file mode 100644 index 00000000..f168a020 --- /dev/null +++ b/src/util/renderers/mediawiki-parsoid-renderer.ts @@ -0,0 +1,105 @@ +import domino from 'domino' +import { DELETED_ARTICLE_ERROR } from '../const.js' +import * as logger from '../../Logger.js' +import { Renderer } from './abstract.renderer.js' +import { getStrippedTitleFromHtml } from '../misc.js' +import { RenderOpts } from './abstract.renderer.js' + +/* +Represents 'https://{wikimedia-wiki}/w/api.php?action=parse&format=json&prop=...&parsoid=1&page={title}' +or +'https://{3rd-party-mediawiki-wiki}/w/api.php?action=parse&format=json&prop=...&parsoid=1&page={title}' (see ApiURLDirector.buildUseParsoidURL for the exact prop list) +*/ +export class MediawikiParsoidRenderer extends Renderer { + constructor() { + super() + } + + public async render(renderOpts: RenderOpts): Promise<any> { + const { data, isMainPage, articleId, articleDetail } = renderOpts + + if (!data) { + throw new Error(`Cannot render [${data}] into an article`) + } + + let strippedTitle: string + const result = [] + if (data.parse) { + // Testing if article has been deleted between fetching list and downloading content. 
+ if (data.parse.revid === 0) { + logger.error(DELETED_ARTICLE_ERROR) + throw new Error(DELETED_ARTICLE_ERROR) + } + const dataHtml = isMainPage ? this.removeNoscript(data.parse.text['*']) : this.injectHeader(this.removeNoscript(data.parse.text['*']), articleDetail) + strippedTitle = getStrippedTitleFromHtml(dataHtml) + result.push({ + articleId, + displayTitle: strippedTitle || articleId.replace(/_/g, ' '), + html: dataHtml, + modules: data.parse.modules || '', + modulescripts: data.parse.modulescripts || '', + modulestyles: data.parse.modulestyles || '', + headhtml: data.parse.headhtml || '', // keep the { '*': html } wrapper: getModuleDependencies() reads headhtml['*'] + }) + return result + } else if (data.error) { + logger.error(`Error in retrieved article [${articleId}]:`, data.error) + return '' + } + logger.error('Unable to parse data from mediawiki parsoid') + return '' + } + + // TODO: this was moved to the abstract renderer in PR1886 + private injectHeader(content: string, articleDetail: any): string { + const doc = domino.createDocument(content) + const header = doc.createElement('h1') + + if (articleDetail?.title) { + header.appendChild(doc.createTextNode(articleDetail.title)) + header.classList.add('article-header') + + const target = doc.querySelector('body.mw-body-content') || doc.querySelector('body') + + if (target) { + target.insertAdjacentElement('afterbegin', header) + } + } + + return doc.documentElement.outerHTML + } + + // Remove noscript elements but preserve inner content + private removeNoscript(content: string) { + const doc = domino.createDocument(content) + const noscriptNodes = Array.from(doc.querySelectorAll('noscript')) + + if (noscriptNodes && noscriptNodes.length > 0) { + noscriptNodes.forEach((noscriptEl) => { + const noscriptElParent = noscriptEl.parentNode + + if (noscriptElParent) { + // Transfer noscript children into the parent node + while (noscriptEl.firstChild) { + if (noscriptEl.firstChild.nodeType === doc.TEXT_NODE) { + const domElem = 
domino.createDocument(noscriptEl.innerHTML).documentElement + // Remove any text content as it's no longer needed + noscriptEl.removeChild(noscriptEl.firstChild) + // Retrieve img from noscript + const imgs = Array.from(domElem.querySelectorAll('img')) + imgs.forEach((img) => { + noscriptEl.appendChild(img) + }) + } + if (noscriptEl.firstChild) noscriptElParent.insertBefore(noscriptEl.firstChild, noscriptEl) // noscript may be empty once its text node is dropped + } + + // Remove noscript along with children + noscriptElParent.removeChild(noscriptEl) + } + }) + } + + return doc.documentElement.outerHTML + } +} diff --git a/src/util/renderers/renderer.builder.ts b/src/util/renderers/renderer.builder.ts index 257c6a99..e7abc6f7 100644 --- a/src/util/renderers/renderer.builder.ts +++ b/src/util/renderers/renderer.builder.ts @@ -2,6 +2,7 @@ import MediaWiki from './../../MediaWiki.js' import { Renderer } from './abstract.renderer.js' import { VisualEditorRenderer } from './visual-editor.renderer.js' import { WikimediaDesktopRenderer } from './wikimedia-desktop.renderer.js' +import { MediawikiParsoidRenderer } from './mediawiki-parsoid-renderer.js' import { RendererBuilderOptions } from './abstract.renderer.js' import * as logger from './../../Logger.js' @@ -9,11 +10,17 @@ export class RendererBuilder { public async createRenderer(options: RendererBuilderOptions): Promise { const { renderType, renderName } = options - const [hasVisualEditorApi, hasWikimediaDesktopRestApi] = await Promise.all([MediaWiki.hasVisualEditorApi(), MediaWiki.hasWikimediaDesktopRestApi()]) + const [hasVisualEditorApi, hasWikimediaDesktopRestApi, hasMediawikiParsoidApi] = await Promise.all([ + MediaWiki.hasVisualEditorApi(), + MediaWiki.hasWikimediaDesktopRestApi(), + MediaWiki.hasMediawikiParsoidApi(), + ]) switch (renderType) { case 'desktop': - if (hasWikimediaDesktopRestApi) { + if (hasMediawikiParsoidApi) { + return new MediawikiParsoidRenderer() + } else if (hasWikimediaDesktopRestApi) { // Choose WikimediaDesktopRenderer if it's present, regardless of hasVisualEditorApi 
value return new WikimediaDesktopRenderer() } else if (hasVisualEditorApi) { @@ -26,7 +33,9 @@ export class RendererBuilder { // TODO: return WikimediaMobile renderer break case 'auto': - if (hasWikimediaDesktopRestApi) { + if (hasMediawikiParsoidApi) { + return new MediawikiParsoidRenderer() + } else if (hasWikimediaDesktopRestApi) { // Choose WikimediaDesktopRenderer if it's present, regardless of hasVisualEditorApi value return new WikimediaDesktopRenderer() } else if (hasVisualEditorApi) { @@ -38,6 +47,12 @@ export class RendererBuilder { case 'specific': // renderName argument is required for 'specific' mode switch (renderName) { + case 'MediawikiParsoid': + if (hasMediawikiParsoidApi) { + return new MediawikiParsoidRenderer() + } + logger.error('Cannot create an instance of MediawikiParsoid renderer.') + process.exit(1) case 'WikimediaDesktop': if (hasWikimediaDesktopRestApi) { return new WikimediaDesktopRenderer() diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index 76b1c5b4..5d3ed258 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -17,6 +17,7 @@ import articleTreatment from './treatments/article.treatment.js' import urlHelper from './url.helper.js' import { RendererBuilderOptions, Renderer } from './renderers/abstract.renderer.js' import { RendererBuilder } from './renderers/renderer.builder.js' +import { MediawikiParsoidRenderer } from './renderers/mediawiki-parsoid-renderer.js' const genericJsModules = config.output.mw.js const genericCssModules = config.output.mw.css @@ -296,21 +297,32 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade let rets: any try { const articleUrl = getArticleUrl(downloader, dump, articleId) + let isRenderReturnModules: boolean if (dump.isMainPage) { + /* + Check whether the renderer API can download modules along with article text content. + Only MediawikiParsoidRenderer has this feature. 
+ */ + isRenderReturnModules = mainPageRenderer instanceof MediawikiParsoidRenderer rets = await downloader.getArticle(articleId, articleDetailXId, mainPageRenderer, articleUrl, articleDetail, dump.isMainPage(articleId)) } + isRenderReturnModules = articlesRenderer instanceof MediawikiParsoidRenderer rets = await downloader.getArticle(articleId, articleDetailXId, articlesRenderer, articleUrl, articleDetail, dump.isMainPage(articleId)) - for (const { articleId, displayTitle: articleTitle, html: articleHtml } of rets) { + for (const { articleId, displayTitle: articleTitle, html: articleHtml, modules, modulescripts, modulestyles, headhtml } of rets) { const nonPaginatedArticleId = articleDetail.title + const modulesData = { modules, modulescripts, modulestyles, headhtml } + if (!articleHtml) { logger.warn(`No HTML returned for article [${articleId}], skipping`) continue } curStage += 1 - const _moduleDependencies = await getModuleDependencies(nonPaginatedArticleId, downloader) + + const _moduleDependencies = await getModuleDependencies(nonPaginatedArticleId, downloader, isRenderReturnModules, modulesData) + for (const dep of _moduleDependencies.jsDependenciesList) { jsModuleDependencies.add(dep) } @@ -401,37 +413,56 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade } } -export async function getModuleDependencies(articleId: string, downloader: Downloader) { +export async function getModuleDependencies(articleId: string, downloader: Downloader, isRenderReturnModules: boolean, modulesData: object) { /* These vars will store the list of js and css dependencies for the article we are downloading. 
*/ let jsConfigVars = '' let jsDependenciesList: string[] = [] let styleDependenciesList: string[] = [] - const apiUrlDirector = new ApiURLDirector(MediaWiki.apiUrl.href) + let moduleObj = { + modules: '', + modulescripts: '', + modulestyles: '', + headhtml: '', + } - const articleApiUrl = apiUrlDirector.buildArticleApiURL(articleId) + if (!isRenderReturnModules) { + const apiUrlDirector = new ApiURLDirector(MediaWiki.apiUrl.href) - const articleData = await downloader.getJSON(articleApiUrl) + const articleApiUrl = apiUrlDirector.buildArticleApiURL(articleId) - if (articleData.error) { - const errorMessage = `Unable to retrieve js/css dependencies for article '${articleId}': ${articleData.error.code}` - logger.error(errorMessage) + const articleData = await downloader.getJSON(articleApiUrl) - /* If article is missing (for example because it just has been deleted) */ - if (articleData.error.code === 'missingtitle') { - return { jsConfigVars, jsDependenciesList, styleDependenciesList } + if (articleData.error) { + const errorMessage = `Unable to retrieve js/css dependencies for article '${articleId}': ${articleData.error.code}` + logger.error(errorMessage) + + /* If article is missing (for example because it just has been deleted) */ + if (articleData.error.code === 'missingtitle') { + return { jsConfigVars, jsDependenciesList, styleDependenciesList } + } + + /* Something went wrong in modules retrieval at app level (no HTTP error) */ + throw new Error(errorMessage) } - /* Something went wrong in modules retrieval at app level (no HTTP error) */ - throw new Error(errorMessage) + const { + parse: { modules, modulescripts, modulestyles, headhtml }, + } = articleData + + moduleObj = { + modules, + modulescripts, + modulestyles, + headhtml, + } + } else { + moduleObj = { ...moduleObj, ...modulesData } } - const { - parse: { modules, modulescripts, modulestyles, headhtml }, - } = articleData - jsDependenciesList = genericJsModules.concat(modules, modulescripts).filter((a) 
=> a) - styleDependenciesList = [].concat(modules, modulestyles, genericCssModules).filter((a) => a) + jsDependenciesList = genericJsModules.concat(moduleObj.modules, moduleObj.modulescripts).filter((a) => a) + styleDependenciesList = [].concat(moduleObj.modules, moduleObj.modulestyles, genericCssModules).filter((a) => a) styleDependenciesList = styleDependenciesList.filter((oneStyleDep) => !contains(config.filters.blackListCssModules, oneStyleDep)) logger.info(`Js dependencies of ${articleId} : ${jsDependenciesList}`) @@ -439,7 +470,7 @@ export async function getModuleDependencies(articleId: string, downloader: Downl // Saving, as a js module, the jsconfigvars that are set in the header of a wikipedia page // the script below extracts the config with a regex executed on the page header returned from the api - const scriptTags = domino.createDocument(`${headhtml['*']}`).getElementsByTagName('script') + const scriptTags = domino.createDocument(`${moduleObj.headhtml['*']}`).getElementsByTagName('script') const regex = /mw\.config\.set\(\{.*?\}\);/gm // eslint-disable-next-line @typescript-eslint/prefer-for-of for (let i = 0; i < scriptTags.length; i += 1) { diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts index a0204446..2e3d90d5 100644 --- a/test/unit/downloader.test.ts +++ b/test/unit/downloader.test.ts @@ -30,6 +30,7 @@ describe('Downloader class', () => { downloader = new Downloader({ uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: true, optimisationCacheUrl: '' }) await MediaWiki.getMwMetaData(downloader) + await MediaWiki.hasMediawikiParsoidApi() await MediaWiki.hasWikimediaDesktopRestApi() await MediaWiki.hasVisualEditorApi() await downloader.setBaseUrls() diff --git a/test/unit/mwApi.test.ts b/test/unit/mwApi.test.ts index 5bc05cd2..0c6e9181 100644 --- a/test/unit/mwApi.test.ts +++ b/test/unit/mwApi.test.ts @@ -23,6 +23,7 @@ describe('mwApi', () => { downloader = new Downloader({ uaString: 
`${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: '' }) await MediaWiki.getMwMetaData(downloader) + await MediaWiki.hasMediawikiParsoidApi() await MediaWiki.hasWikimediaDesktopRestApi() await MediaWiki.hasVisualEditorApi() await downloader.checkCoordinatesAvailability() diff --git a/test/unit/renderers/renderer.builder.test.ts b/test/unit/renderers/renderer.builder.test.ts index 01606307..622fb9f9 100644 --- a/test/unit/renderers/renderer.builder.test.ts +++ b/test/unit/renderers/renderer.builder.test.ts @@ -80,6 +80,7 @@ describe('RendererBuilder', () => { it('should throw an error for unknown RendererAPI in specific mode', async () => { const { downloader, MediaWiki } = await setupScrapeClasses() // en wikipedia + await MediaWiki.hasMediawikiParsoidApi() await MediaWiki.hasWikimediaDesktopRestApi() await MediaWiki.hasVisualEditorApi() await downloader.setBaseUrls() diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts index 6e426ebe..ba2ea531 100644 --- a/test/unit/saveArticles.test.ts +++ b/test/unit/saveArticles.test.ts @@ -18,6 +18,7 @@ describe('saveArticles', () => { test('Article html processing', async () => { const { MediaWiki, downloader, dump } = await setupScrapeClasses() // en wikipedia + await MediaWiki.hasMediawikiParsoidApi() await MediaWiki.hasWikimediaDesktopRestApi() await MediaWiki.hasVisualEditorApi() await downloader.setBaseUrls() @@ -210,7 +211,7 @@ describe('saveArticles', () => { test('Load inline js from HTML', async () => { const { downloader } = await setupScrapeClasses() // en wikipedia - const _moduleDependencies = await getModuleDependencies('Potato', downloader) + const _moduleDependencies = await getModuleDependencies('Potato', downloader, false, {}) // next variables declared to avoid "variable is not defined" errors let RLCONF: any // eslint-disable-next-line @typescript-eslint/no-unused-vars diff --git a/test/unit/urlRewriting.test.ts 
b/test/unit/urlRewriting.test.ts index e5c28219..c3efa2ac 100644 --- a/test/unit/urlRewriting.test.ts +++ b/test/unit/urlRewriting.test.ts @@ -138,6 +138,7 @@ describe('Styles', () => { await articleDetailXId.flush() await redisStore.redirectsXId.flush() const { MediaWiki, downloader, dump } = await setupScrapeClasses() // en wikipedia + await MediaWiki.hasMediawikiParsoidApi() await MediaWiki.hasWikimediaDesktopRestApi() await MediaWiki.hasVisualEditorApi() await downloader.checkCoordinatesAvailability() diff --git a/test/util.ts b/test/util.ts index 6ba396a8..b24df51f 100644 --- a/test/util.ts +++ b/test/util.ts @@ -36,6 +36,7 @@ export async function setupScrapeClasses({ mwUrl = 'https://en.wikipedia.org', f const downloader = new Downloader({ uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: '' }) await MediaWiki.getMwMetaData(downloader) + await MediaWiki.hasMediawikiParsoidApi() await MediaWiki.hasWikimediaDesktopRestApi() await MediaWiki.hasVisualEditorApi() await downloader.checkCoordinatesAvailability()