diff --git a/src/Downloader.ts b/src/Downloader.ts
index eb5c1504..d3d63e1a 100644
--- a/src/Downloader.ts
+++ b/src/Downloader.ts
@@ -14,6 +14,7 @@ import imageminWebp from 'imagemin-webp'
 import sharp from 'sharp'
 import http from 'http'
 import https from 'https'
+import semver from 'semver'
 
 import {
   normalizeMwResponse,
@@ -78,6 +79,7 @@ export interface MWCapabilities {
   coordinatesAvailable: boolean
   desktopRestApiAvailable: boolean
   mobileRestApiAvailable: boolean
+  useParsoidApiAvailable: boolean
 }
 
 export const defaultStreamRequestOptions: AxiosRequestConfig = {
@@ -130,6 +132,7 @@ class Downloader {
       coordinatesAvailable: true,
       desktopRestApiAvailable: false,
       mobileRestApiAvailable: false,
+      useParsoidApiAvailable: false,
     }
 
     this.backoffOptions = {
@@ -216,7 +219,12 @@ class Downloader {
   }
 
   public async setBaseUrls() {
-    if (this.mwCapabilities.desktopRestApiAvailable) {
+    if (this.mwCapabilities.useParsoidApiAvailable) {
+      this.baseUrl = {
+        url: this.mw.useParsoidApiUrl.href,
+        type: ApiUrlType.UseParsoid,
+      }
+    } else if (this.mwCapabilities.desktopRestApiAvailable) {
       this.baseUrl = {
         url: this.mw.desktopRestApiUrl.href,
         type: ApiUrlType.DesktopRest,
@@ -236,7 +244,7 @@ class Downloader {
     // never use it for the main page, but use it for all the other pages.
     this.baseUrlForMainPage = this.baseUrl
 
-    if (this.mwCapabilities.mobileRestApiAvailable) {
+    if (this.mwCapabilities.mobileRestApiAvailable && this.baseUrl.type !== ApiUrlType.UseParsoid) {
       this.baseUrl = {
         url: this.mw.mobileRestApiUrl.href,
         type: ApiUrlType.MobileRest,
@@ -264,6 +272,7 @@ class Downloader {
     // accordingly. We need to set a default page (always there because
     // installed per default) to request the REST API, otherwise it would
     // fail the check.
+    this.mwCapabilities.useParsoidApiAvailable = semver.satisfies(this.mw.metaData.mwVersion, '>=1.41.0')
     this.mwCapabilities.mobileRestApiAvailable = await this.checkApiAvailabilty(this.mw.getMobileRestApiArticleUrl(testArticleId))
     this.mwCapabilities.desktopRestApiAvailable = await this.checkApiAvailabilty(this.mw.getDesktopRestApiArticleUrl(testArticleId))
     this.mwCapabilities.veApiAvailable = await this.checkApiAvailabilty(this.mw.getVeApiArticleUrl(testArticleId))
@@ -389,8 +398,7 @@ class Downloader {
   }
 
   public async getArticle(articleId: string, dump: Dump, articleDetailXId: RKVS, articleDetail?: ArticleDetail): Promise {
-    const isMainPage = dump.isMainPage(articleId)
-    const articleApiUrl: ApiUrl = this.getArticleApiUrl(articleId, isMainPage)
+    const articleApiUrl: ApiUrl = this.getArticleApiUrl(articleId, dump)
 
     logger.info(`Getting article [${articleId}] from ${articleApiUrl.url} with type ${articleApiUrl.type}`)
 
@@ -469,7 +477,8 @@ class Downloader {
     }
   }
 
-  private getArticleApiUrl(articleId: string, isMainPage: boolean): ApiUrl {
+  public getArticleApiUrl(articleId: string, dump: Dump): ApiUrl {
+    const isMainPage = dump.isMainPage(articleId)
     const apiUrl = isMainPage ? this.baseUrlForMainPage : this.baseUrl
     return {
       url: `${apiUrl.url}${encodeURIComponent(articleId)}`,
diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts
index fa899f4b..3078a173 100644
--- a/src/MediaWiki.ts
+++ b/src/MediaWiki.ts
@@ -19,6 +19,7 @@ class MediaWiki {
   public readonly restApiUrl: URL
   public readonly mobileRestApiUrl: URL
   public readonly desktopRestApiUrl: URL
+  public readonly useParsoidApiUrl: URL
   public readonly getCategories: boolean
   public readonly namespaces: MWNamespaces = {}
   public readonly namespacesToMirror: string[] = []
@@ -29,6 +30,7 @@ class MediaWiki {
   private readonly apiPath: string
   private readonly domain: string
   private readonly articleApiUrlBase: string
+  private readonly articleParsoidApiUrlBase: string
 
   constructor(config: MWConfig) {
     this.domain = config.domain || ''
@@ -49,9 +51,11 @@ class MediaWiki {
     this.restApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1', this.baseUrl.href).toString(), '/'))
     this.mobileRestApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1/page/mobile-sections', this.baseUrl.href).toString(), '/'))
     this.desktopRestApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1/page/html', this.baseUrl.href).toString(), '/'))
+    this.useParsoidApiUrl = new URL(`${this.apiUrl.href}action=parse&format=json&prop=text|revid&parsoid=1&formatversion=2&page=`)
 
     this.modulePath = `${urlParser.resolve(this.baseUrl.href, config.modulePath ?? 'w/load.php')}?`
     this.articleApiUrlBase = `${this.apiUrl.href}action=parse&format=json&prop=${encodeURI('modules|jsconfigvars|headhtml')}&page=`
+    this.articleParsoidApiUrlBase = `${this.apiUrl.href}action=parse&format=json&prop=${encodeURI('modules|jsconfigvars|headhtml|text')}&useparsoid=1&page=`
   }
 
   public async login(downloader: Downloader) {
@@ -101,8 +105,8 @@ class MediaWiki {
     return `${this.apiUrl.href}action=query&meta=siteinfo&format=json`
   }
 
-  public articleApiUrl(articleId: string): string {
-    return `${this.articleApiUrlBase}${encodeURIComponent(articleId)}`
+  public articleApiUrl(articleId: string, useParsoid: boolean): string {
+    return `${useParsoid ? this.articleParsoidApiUrlBase : this.articleApiUrlBase}${encodeURIComponent(articleId)}`
   }
 
   public subCategoriesApiUrl(articleId: string, continueStr = '') {
@@ -278,6 +282,7 @@ class MediaWiki {
       siteName,
       langIso2,
       langIso3,
+      mwVersion,
     }
   }
 
@@ -297,7 +302,7 @@ class MediaWiki {
 
     const creator = this.getCreatorName() || 'Kiwix'
 
-    const [textDir, { langIso2, langIso3, mainPage, siteName }, subTitle] = await Promise.all([
+    const [textDir, { langIso2, langIso3, mainPage, siteName, mwVersion }, subTitle] = await Promise.all([
       this.getTextDirection(downloader),
       this.getSiteInfo(downloader),
       this.getSubTitle(downloader),
@@ -320,6 +325,7 @@ class MediaWiki {
       subTitle,
       creator,
       mainPage,
+      mwVersion,
     }
 
     this.metaData = mwMetaData
diff --git a/src/types.d.ts b/src/types.d.ts
index 50ffb815..27eed0ac 100644
--- a/src/types.d.ts
+++ b/src/types.d.ts
@@ -156,6 +156,7 @@ interface MWMetaData {
   creator: string
   mainPage: string
   textDir: TextDirection
+  mwVersion: string
 
   baseUrl: string
   wikiPath: string
diff --git a/src/util/articleRenderers.ts b/src/util/articleRenderers.ts
index 8c0bb255..650ba49f 100644
--- a/src/util/articleRenderers.ts
+++ b/src/util/articleRenderers.ts
@@ -8,6 +8,7 @@ import { DELETED_ARTICLE_ERROR } from './const.js'
 
 export enum ApiUrlType {
   Unknown = 'unknown',
+  UseParsoid = 'useParsoid',
   MobileRest = 'mobileRest',
   DesktopRest = 'desktopRest',
   VE = 've',
@@ -29,6 +30,17 @@ export const renderArticle = async (
   const articleDetail = articleDetailIn || (await articleDetailXId.get(articleId))
   const isMainPage = dump.isMainPage(articleId)
 
+  if (json.type == ApiUrlType.UseParsoid) {
+    const html = renderParsoidArticle(json, articleId, articleDetail, isMainPage)
+    return [
+      {
+        articleId,
+        displayTitle: articleId.replace('_', ' '),
+        html,
+      },
+    ]
+  }
+
   // Main Page is never ApiUrlType.MobileRest
   if (isMainPage || json.type == ApiUrlType.VE) {
     const html = json.type === ApiUrlType.DesktopRest ? json.data : renderDesktopArticle(json, articleId, articleDetail, isMainPage)
@@ -88,11 +100,24 @@ const injectHeader = (content: string, articleId: string, articleDetail: Article
   const header = doc.createElement('h1')
   header.appendChild(doc.createTextNode(articleDetail.title))
   header.classList.add('article-header')
-  const target = doc.querySelector('body.mw-body-content')
+  const target = doc.querySelector('body')
   target.insertAdjacentElement('afterbegin', header)
   return doc.documentElement.outerHTML
 }
 
+export const renderParsoidArticle = (wrappedJson: { type: ApiUrlType; data: any }, articleId: string, articleDetail: ArticleDetail, isMainPage = false): string => {
+  if (!wrappedJson || wrappedJson.type !== ApiUrlType.UseParsoid) {
+    throw new Error(`Cannot render [${wrappedJson}] into an article`)
+  }
+  const json = wrappedJson.data
+  // Testing if article has been deleted between fetching list and downloading content.
+  if (json.parse.revid === 0) {
+    logger.error(DELETED_ARTICLE_ERROR)
+    throw new Error(DELETED_ARTICLE_ERROR)
+  }
+  return injectHeader(json.parse.text, articleId, articleDetail)
+}
+
 export const renderDesktopArticle = (wrappedJson: { type: ApiUrlType; data: any }, articleId: string, articleDetail: ArticleDetail, isMainPage = false): string => {
   if (!wrappedJson || !wrappedJson.type) {
     throw new Error(`Cannot render [${wrappedJson}] into an article`)
diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts
index f0f08b18..3651ca32 100644
--- a/src/util/saveArticles.ts
+++ b/src/util/saveArticles.ts
@@ -10,7 +10,7 @@ import DU from '../DOMUtils.js'
 import * as domino from 'domino'
 import { Dump } from '../Dump.js'
 import Timer from './Timer.js'
-import { contains, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getFullUrl, getMediaBase, jsPath } from './index.js'
+import { contains, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getFullUrl, getMediaBase, jsPath, ApiUrlType } from './index.js'
 import { config } from '../config.js'
 import { footerTemplate, htmlTemplateCode } from '../Templates.js'
 import {
@@ -278,17 +278,19 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade
       const promises: [string, Promise][] = []
 
       try {
+        // When the UseParsoid api is available, this call could return the
+        // parsoid HTML in the same API invocation that gets the modules,
+        // but we're not going to make that optimization quite yet.
+        const _moduleDependencies = await getModuleDependencies(articleDetail.title, mw, downloader, dump)
         const rets = await downloader.getArticle(articleId, dump, articleDetailXId, articleDetail)
 
         for (const { articleId, displayTitle: articleTitle, html: articleHtml } of rets) {
-          const nonPaginatedArticleId = articleDetail.title
           if (!articleHtml) {
             logger.warn(`No HTML returned for article [${articleId}], skipping`)
             continue
           }
 
           curStage += 1
-          const _moduleDependencies = await getModuleDependencies(nonPaginatedArticleId, mw, downloader)
           for (const dep of _moduleDependencies.jsDependenciesList) {
             jsModuleDependencies.add(dep)
           }
@@ -377,14 +379,15 @@
   }
 }
 
-export async function getModuleDependencies(articleId: string, mw: MediaWiki, downloader: Downloader) {
+export async function getModuleDependencies(articleId: string, mw: MediaWiki, downloader: Downloader, dump: Dump) {
   /* These vars will store the list of js and css dependencies for the article we are downloading.
   */
   let jsConfigVars = ''
   let jsDependenciesList: string[] = []
   let styleDependenciesList: string[] = []
 
-  const articleApiUrl = mw.articleApiUrl(articleId)
+  const downloadApiUrlType = downloader.getArticleApiUrl(articleId, dump).type
+  const articleApiUrl = mw.articleApiUrl(articleId, downloadApiUrlType === ApiUrlType.UseParsoid)
 
   const articleData = await downloader.getJSON(articleApiUrl)
 
diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts
index 44af664b..4342f374 100644
--- a/test/unit/saveArticles.test.ts
+++ b/test/unit/saveArticles.test.ts
@@ -355,9 +355,10 @@ describe('saveArticles', () => {
   })
 
   test('Load inline js from HTML', async () => {
-    const { downloader, mw } = await setupScrapeClasses() // en wikipedia
+    const { downloader, mw, dump } = await setupScrapeClasses() // en wikipedia
+    await downloader.setBaseUrls()
 
-    const _moduleDependencies = await getModuleDependencies('Potato', mw, downloader)
+    const _moduleDependencies = await getModuleDependencies('Potato', mw, downloader, dump)
     // next variables declared to avoid "variable is not defined" errors
     let RLCONF: any
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
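For reference, a minimal sketch (not part of the patch) of the JSON that the new UseParsoid base URL returns and how renderParsoidArticle consumes it. The interface and helper names below are illustrative only; the response shape assumes action=parse with prop=text|revid and formatversion=2, mirroring the query string built for useParsoidApiUrl above, and the revid === 0 check mirrors the deleted-article guard in the patch.

// Illustrative sketch only, not part of the patch.
// Assumed response shape for action=parse with prop=text|revid and formatversion=2.
interface ParsoidParseResponse {
  parse: {
    title: string
    pageid: number
    revid: number // 0 is treated as "article deleted between listing and download"
    text: string // Parsoid HTML as a plain string (formatversion=2), fed to injectHeader
  }
}

// Hypothetical helper: baseUrl is a prefix ending in "&page=" like useParsoidApiUrl,
// combined with an encoded article id, as getArticleApiUrl and getJSON do in Downloader.ts.
async function fetchParsoidHtml(baseUrl: string, articleId: string): Promise<string> {
  const resp = await fetch(`${baseUrl}${encodeURIComponent(articleId)}`)
  const json = (await resp.json()) as ParsoidParseResponse
  if (json.parse.revid === 0) {
    throw new Error('Article was deleted between listing and download')
  }
  return json.parse.text
}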