Add new 'UseParsoid' API type, and prefer it over MCS API
cscott authored and kelson42 committed Jul 28, 2023
1 parent d7311f3 commit 0e99380
Showing 6 changed files with 61 additions and 16 deletions.
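In short: the scraper will now prefer MediaWiki's integrated Parsoid output (action=parse with parsoid=1, shipped with MediaWiki 1.41 and later) over the desktop REST endpoint, and will no longer route articles through the mobile-sections (MCS) REST API when Parsoid is selected. The TypeScript sketch below condenses that selection order for orientation only; the flag and type names mirror MWCapabilities and ApiUrlType from the diff, but the helper itself is illustrative and not part of the commit.

// Illustrative sketch of the API preference implemented in Downloader.setBaseUrls() below.
interface Capabilities {
  useParsoidApiAvailable: boolean
  desktopRestApiAvailable: boolean
  mobileRestApiAvailable: boolean
}

type ApiType = 'useParsoid' | 'desktopRest' | 'mobileRest' | 've'

function pickApiType(caps: Capabilities, isMainPage: boolean): ApiType {
  // Base preference: the new Parsoid endpoint first, then desktop REST, then the action API (VE) fallback.
  const base: ApiType = caps.useParsoidApiAvailable ? 'useParsoid' : caps.desktopRestApiAvailable ? 'desktopRest' : 've'
  // Non-main pages may still be fetched through the mobile REST (MCS) API,
  // but never when Parsoid was selected; that is the new guard in setBaseUrls().
  if (!isMainPage && caps.mobileRestApiAvailable && base !== 'useParsoid') {
    return 'mobileRest'
  }
  return base
}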
19 changes: 14 additions & 5 deletions src/Downloader.ts
@@ -14,6 +14,7 @@ import imageminWebp from 'imagemin-webp'
import sharp from 'sharp'
import http from 'http'
import https from 'https'
import semver from 'semver'

import {
normalizeMwResponse,
@@ -78,6 +79,7 @@ export interface MWCapabilities {
coordinatesAvailable: boolean
desktopRestApiAvailable: boolean
mobileRestApiAvailable: boolean
useParsoidApiAvailable: boolean
}

export const defaultStreamRequestOptions: AxiosRequestConfig = {
@@ -130,6 +132,7 @@ class Downloader {
coordinatesAvailable: true,
desktopRestApiAvailable: false,
mobileRestApiAvailable: false,
useParsoidApiAvailable: false,
}

this.backoffOptions = {
@@ -216,7 +219,12 @@ class Downloader {
}

public async setBaseUrls() {
if (this.mwCapabilities.desktopRestApiAvailable) {
if (this.mwCapabilities.useParsoidApiAvailable) {
this.baseUrl = {
url: this.mw.useParsoidApiUrl.href,
type: ApiUrlType.UseParsoid,
}
} else if (this.mwCapabilities.desktopRestApiAvailable) {
this.baseUrl = {
url: this.mw.desktopRestApiUrl.href,
type: ApiUrlType.DesktopRest,
@@ -236,7 +244,7 @@ class Downloader {
// never use it for the main page, but use it for all the other pages.
this.baseUrlForMainPage = this.baseUrl

if (this.mwCapabilities.mobileRestApiAvailable) {
if (this.mwCapabilities.mobileRestApiAvailable && this.baseUrl.type !== ApiUrlType.UseParsoid) {
this.baseUrl = {
url: this.mw.mobileRestApiUrl.href,
type: ApiUrlType.MobileRest,
@@ -264,6 +272,7 @@ class Downloader {
// accordingly. We need to set a default page (always there because
// installed per default) to request the REST API, otherwise it would
// fail the check.
this.mwCapabilities.useParsoidApiAvailable = semver.satisfies(this.mw.metaData.mwVersion, '>=1.41.0')
this.mwCapabilities.mobileRestApiAvailable = await this.checkApiAvailabilty(this.mw.getMobileRestApiArticleUrl(testArticleId))
this.mwCapabilities.desktopRestApiAvailable = await this.checkApiAvailabilty(this.mw.getDesktopRestApiArticleUrl(testArticleId))
this.mwCapabilities.veApiAvailable = await this.checkApiAvailabilty(this.mw.getVeApiArticleUrl(testArticleId))
@@ -389,8 +398,7 @@ class Downloader {
}

public async getArticle(articleId: string, dump: Dump, articleDetailXId: RKVS<ArticleDetail>, articleDetail?: ArticleDetail): Promise<RenderedArticle[]> {
const isMainPage = dump.isMainPage(articleId)
const articleApiUrl: ApiUrl = this.getArticleApiUrl(articleId, isMainPage)
const articleApiUrl: ApiUrl = this.getArticleApiUrl(articleId, dump)

logger.info(`Getting article [${articleId}] from ${articleApiUrl.url} with type ${articleApiUrl.type}`)

@@ -469,7 +477,8 @@ class Downloader {
}
}

private getArticleApiUrl(articleId: string, isMainPage: boolean): ApiUrl {
public getArticleApiUrl(articleId: string, dump: Dump): ApiUrl {
const isMainPage = dump.isMainPage(articleId)
const apiUrl = isMainPage ? this.baseUrlForMainPage : this.baseUrl
return {
url: `${apiUrl.url}${encodeURIComponent(articleId)}`,
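Unlike the REST endpoints, which are probed with a real test request via checkApiAvailabilty(), the new capability is derived purely from the wiki's reported MediaWiki version using the freshly imported semver package. A standalone illustration of that gate; the '>=1.41.0' range is taken from the diff, while the helper name and version strings are made up for the example:

import semver from 'semver'

// Hypothetical helper wrapping the one-line semver.satisfies() check added above.
const supportsUseParsoid = (mwVersion: string): boolean => semver.satisfies(mwVersion, '>=1.41.0')

console.log(supportsUseParsoid('1.39.4')) // false: stay on the desktop REST / VE path
console.log(supportsUseParsoid('1.41.0')) // true: prefer action=parse with Parsoid output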
12 changes: 9 additions & 3 deletions src/MediaWiki.ts
@@ -19,6 +19,7 @@ class MediaWiki {
public readonly restApiUrl: URL
public readonly mobileRestApiUrl: URL
public readonly desktopRestApiUrl: URL
public readonly useParsoidApiUrl: URL
public readonly getCategories: boolean
public readonly namespaces: MWNamespaces = {}
public readonly namespacesToMirror: string[] = []
@@ -29,6 +30,7 @@ class MediaWiki {
private readonly apiPath: string
private readonly domain: string
private readonly articleApiUrlBase: string
private readonly articleParsoidApiUrlBase: string

constructor(config: MWConfig) {
this.domain = config.domain || ''
@@ -49,9 +51,11 @@ class MediaWiki {
this.restApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1', this.baseUrl.href).toString(), '/'))
this.mobileRestApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1/page/mobile-sections', this.baseUrl.href).toString(), '/'))
this.desktopRestApiUrl = new URL(ensureTrailingChar(new URL(config.restApiPath ?? 'api/rest_v1/page/html', this.baseUrl.href).toString(), '/'))
this.useParsoidApiUrl = new URL(`${this.apiUrl.href}action=parse&format=json&prop=text|revid&parsoid=1&formatversion=2&page=`)

this.modulePath = `${urlParser.resolve(this.baseUrl.href, config.modulePath ?? 'w/load.php')}?`
this.articleApiUrlBase = `${this.apiUrl.href}action=parse&format=json&prop=${encodeURI('modules|jsconfigvars|headhtml')}&page=`
this.articleParsoidApiUrlBase = `${this.apiUrl.href}action=parse&format=json&prop=${encodeURI('modules|jsconfigvars|headhtml|text')}&useparsoid=1&page=`
}

public async login(downloader: Downloader) {
@@ -101,8 +105,8 @@ class MediaWiki {
return `${this.apiUrl.href}action=query&meta=siteinfo&format=json`
}

public articleApiUrl(articleId: string): string {
return `${this.articleApiUrlBase}${encodeURIComponent(articleId)}`
public articleApiUrl(articleId: string, useParsoid: boolean): string {
return `${useParsoid ? this.articleParsoidApiUrlBase : this.articleApiUrlBase}${encodeURIComponent(articleId)}`
}

public subCategoriesApiUrl(articleId: string, continueStr = '') {
@@ -278,6 +282,7 @@ class MediaWiki {
siteName,
langIso2,
langIso3,
mwVersion,
}
}

Expand All @@ -297,7 +302,7 @@ class MediaWiki {

const creator = this.getCreatorName() || 'Kiwix'

const [textDir, { langIso2, langIso3, mainPage, siteName }, subTitle] = await Promise.all([
const [textDir, { langIso2, langIso3, mainPage, siteName, mwVersion }, subTitle] = await Promise.all([
this.getTextDirection(downloader),
this.getSiteInfo(downloader),
this.getSubTitle(downloader),
@@ -320,6 +325,7 @@ class MediaWiki {
subTitle,
creator,
mainPage,
mwVersion,
}

this.metaData = mwMetaData
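Two Parsoid-flavoured URL bases are introduced: useParsoidApiUrl (prop=text|revid, parsoid=1, formatversion=2) for fetching article HTML, and articleParsoidApiUrlBase (prop=modules|jsconfigvars|headhtml|text, useparsoid=1) for the module-dependency request. As a rough, hand-assembled example of what a resulting article request looks like (the en.wikipedia.org api.php base and the article id are assumptions for illustration, not values from the commit):

// Assumed api.php endpoint, for illustration only.
const apiBase = 'https://en.wikipedia.org/w/api.php?'
const useParsoidApiUrl = `${apiBase}action=parse&format=json&prop=text|revid&parsoid=1&formatversion=2&page=`

const articleId = 'San Francisco'
const articleUrl = `${useParsoidApiUrl}${encodeURIComponent(articleId)}`
// => https://en.wikipedia.org/w/api.php?action=parse&format=json&prop=text|revid&parsoid=1&formatversion=2&page=San%20Francisco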
1 change: 1 addition & 0 deletions src/types.d.ts
@@ -156,6 +156,7 @@ interface MWMetaData {
creator: string
mainPage: string
textDir: TextDirection
mwVersion: string

baseUrl: string
wikiPath: string
27 changes: 26 additions & 1 deletion src/util/articleRenderers.ts
@@ -8,6 +8,7 @@ import { DELETED_ARTICLE_ERROR } from './const.js'

export enum ApiUrlType {
Unknown = 'unknown',
UseParsoid = 'useParsoid',
MobileRest = 'mobileRest',
DesktopRest = 'desktopRest',
VE = 've',
@@ -29,6 +30,17 @@ export const renderArticle = async (
const articleDetail = articleDetailIn || (await articleDetailXId.get(articleId))
const isMainPage = dump.isMainPage(articleId)

if (json.type == ApiUrlType.UseParsoid) {
const html = renderParsoidArticle(json, articleId, articleDetail, isMainPage)
return [
{
articleId,
displayTitle: articleId.replace('_', ' '),
html,
},
]
}

// Main Page is never ApiUrlType.MobileRest
if (isMainPage || json.type == ApiUrlType.VE) {
const html = json.type === ApiUrlType.DesktopRest ? json.data : renderDesktopArticle(json, articleId, articleDetail, isMainPage)
@@ -88,11 +100,24 @@ const injectHeader = (content: string, articleId: string, articleDetail: Article
const header = doc.createElement('h1')
header.appendChild(doc.createTextNode(articleDetail.title))
header.classList.add('article-header')
const target = doc.querySelector('body.mw-body-content')
const target = doc.querySelector('body')
target.insertAdjacentElement('afterbegin', header)
return doc.documentElement.outerHTML
}

export const renderParsoidArticle = (wrappedJson: { type: ApiUrlType; data: any }, articleId: string, articleDetail: ArticleDetail, isMainPage = false): string => {
if (!wrappedJson || wrappedJson.type !== ApiUrlType.UseParsoid) {
throw new Error(`Cannot render [${wrappedJson}] into an article`)
}
const json = wrappedJson.data
// Testing if article has been deleted between fetching list and downloading content.
if (json.parse.revid === 0) {
logger.error(DELETED_ARTICLE_ERROR)
throw new Error(DELETED_ARTICLE_ERROR)
}
return injectHeader(json.parse.text, articleId, articleDetail)
}

export const renderDesktopArticle = (wrappedJson: { type: ApiUrlType; data: any }, articleId: string, articleDetail: ArticleDetail, isMainPage = false): string => {
if (!wrappedJson || !wrappedJson.type) {
throw new Error(`Cannot render [${wrappedJson}] into an article`)
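For context, renderParsoidArticle() consumes the wrapped response the downloader builds for ApiUrlType.UseParsoid: with formatversion=2 and prop=text|revid, action=parse returns the rendered HTML as a plain string in parse.text and the revision id in parse.revid (a revid of 0 is treated as a deleted article). A hand-written example of that shape; the payload values are illustrative rather than real API output:

// Illustrative input for renderParsoidArticle(); not captured from a live wiki.
const wrappedJson = {
  type: ApiUrlType.UseParsoid,
  data: {
    parse: {
      title: 'Potato',
      revid: 1234567,
      text: '<section data-mw-section-id="0"><p>The potato is a starchy tuber...</p></section>',
    },
  },
}
// renderParsoidArticle(wrappedJson, 'Potato', articleDetail) injects an
// <h1 class="article-header"> at the top of <body> and returns the serialized document.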
13 changes: 8 additions & 5 deletions src/util/saveArticles.ts
@@ -10,7 +10,7 @@ import DU from '../DOMUtils.js'
import * as domino from 'domino'
import { Dump } from '../Dump.js'
import Timer from './Timer.js'
import { contains, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getFullUrl, getMediaBase, jsPath } from './index.js'
import { contains, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getFullUrl, getMediaBase, jsPath, ApiUrlType } from './index.js'
import { config } from '../config.js'
import { footerTemplate, htmlTemplateCode } from '../Templates.js'
import {
Expand Down Expand Up @@ -278,17 +278,19 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade
const promises: [string, Promise<Error>][] = []

try {
// When the UseParsoid api is available, this call could return the
// parsoid HTML in the same API invocation that gets the modules,
// but we're not going to make that optimization quite yet.
const _moduleDependencies = await getModuleDependencies(articleDetail.title, mw, downloader, dump)
const rets = await downloader.getArticle(articleId, dump, articleDetailXId, articleDetail)

for (const { articleId, displayTitle: articleTitle, html: articleHtml } of rets) {
const nonPaginatedArticleId = articleDetail.title
if (!articleHtml) {
logger.warn(`No HTML returned for article [${articleId}], skipping`)
continue
}

curStage += 1
const _moduleDependencies = await getModuleDependencies(nonPaginatedArticleId, mw, downloader)
for (const dep of _moduleDependencies.jsDependenciesList) {
jsModuleDependencies.add(dep)
}
@@ -377,14 +379,15 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade
}
}

export async function getModuleDependencies(articleId: string, mw: MediaWiki, downloader: Downloader) {
export async function getModuleDependencies(articleId: string, mw: MediaWiki, downloader: Downloader, dump: Dump) {
/* These vars will store the list of js and css dependencies for
the article we are downloading. */
let jsConfigVars = ''
let jsDependenciesList: string[] = []
let styleDependenciesList: string[] = []

const articleApiUrl = mw.articleApiUrl(articleId)
const downloadApiUrlType = downloader.getArticleApiUrl(articleId, dump).type
const articleApiUrl = mw.articleApiUrl(articleId, downloadApiUrlType === ApiUrlType.UseParsoid)

const articleData = await downloader.getJSON<any>(articleApiUrl)

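getModuleDependencies() now takes the Dump so it can ask the downloader which API type it would use for the article and request the matching action=parse variant, i.e. the useparsoid=1 flavour only when the article itself is served through Parsoid. A condensed sketch of the call site (it mirrors the updated unit test below; 'Potato' is simply the article id that test uses, and mw, downloader and dump come from the surrounding scope):

// Sketch of the new signature in use; error handling omitted.
const _moduleDependencies = await getModuleDependencies('Potato', mw, downloader, dump)
// Internally this now resolves, roughly:
//   const apiType = downloader.getArticleApiUrl('Potato', dump).type
//   const url = mw.articleApiUrl('Potato', apiType === ApiUrlType.UseParsoid)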
5 changes: 3 additions & 2 deletions test/unit/saveArticles.test.ts
@@ -355,9 +355,10 @@ describe('saveArticles', () => {
})

test('Load inline js from HTML', async () => {
const { downloader, mw } = await setupScrapeClasses() // en wikipedia
const { downloader, mw, dump } = await setupScrapeClasses() // en wikipedia
await downloader.setBaseUrls()

const _moduleDependencies = await getModuleDependencies('Potato', mw, downloader)
const _moduleDependencies = await getModuleDependencies('Potato', mw, downloader, dump)
// next variables declared to avoid "variable is not defined" errors
let RLCONF: any
// eslint-disable-next-line @typescript-eslint/no-unused-vars
