script/update-internal-links.js

#!/usr/bin/env node

// [start-readme]
//
// Run this script to find internal links in all content and data Markdown files, check if either the title or link
// (or both) are outdated, and automatically update them if so.
//
// Exceptions:
// * Links with fragments (e.g., [Bar](/foo#bar)) will get their root links updated if necessary, but the fragment
// and title will be unchanged (e.g., [Bar](/noo#bar)).
// * Links with hardcoded versions (e.g., [Foo](/enterprise-server/baz)) will get their root links updated if
// necessary, but the hardcoded versions will be preserved (e.g., [Foo](/enterprise-server/qux)).
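// * Links with Liquid in the titles will have their root links updated if necessary, but the titles will be preserved.
// * Links that are not immediately preceded by an opening quotation mark (e.g., [Foo](/foo) rather than "[Foo](/foo))
// will have their root links updated if necessary, but the titles will be preserved.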
//
// [end-readme]

import { fileURLToPath } from 'url'
import path from 'path'
import fs from 'fs'
import cheerio from 'cheerio'
import walk from 'walk-sync'
import { fromMarkdown } from 'mdast-util-from-markdown'
import visit from 'unist-util-visit'
import { loadPages, loadPageMap } from '../lib/page-data.js'
import loadRedirects from '../lib/redirects/precompile.js'
import { getPathWithoutLanguage, getPathWithoutVersion } from '../lib/path-utils.js'
import { allVersionKeys } from '../lib/all-versions.js'
import frontmatter from '../lib/read-frontmatter.js'
import renderContent from '../lib/render-content/index.js'
import patterns from '../lib/patterns.js'
import getRedirect from '../lib/get-redirect.js'
const __dirname = path.dirname(fileURLToPath(import.meta.url))

// List the Markdown files found under `pathToWalk`, which is resolved against the
// parent of this script's directory. README.md files and anything under an
// /early-access/ directory are left out (ignore EA for now).
const walkFiles = (pathToWalk) => {
  const baseDir = path.posix.join(__dirname, '..', pathToWalk)

  const isWantedMarkdown = (file) =>
    file.endsWith('.md') && !file.endsWith('README.md') && !file.includes('/early-access/')

  return walk(baseDir, { includeBasePath: true, directories: false }).filter(isWantedMarkdown)
}

// Every Markdown file under content/ and data/ that the script will scan.
const allFiles = walkFiles('content').concat(walkFiles('data'))

// Maps the AST node type of inline markup wrapping a link title to its Markdown delimiter.
// The script logs an error and exits with code 1 if it finds any markup not represented here.
// Hacky but it captures the current rare edge cases.
const linkInlineMarkup = {
  emphasis: '*',
  strong: '**',
}

// The spaced Liquid form is rewritten to the unspaced form before parsing (see main).
const currentVersionWithSpacesRegex = /\/enterprise\/{{ currentVersion }}/g
const currentVersionWithoutSpaces = '/enterprise/{{currentVersion}}'

// Don't leave the promise floating: report any unexpected failure and exit non-zero.
main().catch((error) => {
  console.error(error)
  process.exit(1)
})

// Entry point. For every file in `allFiles`, finds the internal Markdown links, looks each one
// up in the loaded pages (following redirects), and rewrites the link path -- and, for links
// preceded by an opening quote, the link title -- when they are outdated. Each file is then
// written back to disk.
//
// Exits the process with code 1 if a link title uses inline markup that is not listed in
// `linkInlineMarkup`, or if a link cannot be resolved to a page under any version.
async function main() {
  console.log('Working...')
  const pageList = await loadPages()
  const pageMap = await loadPageMap(pageList)
  const redirects = await loadRedirects(pageList)

  // Rendering context shared by all links; `currentVersion` is set per version further down.
  const context = {
    pages: pageMap,
    redirects,
    currentLanguage: 'en',
  }

  for (const file of allFiles) {
    const { data, content } = frontmatter(fs.readFileSync(file, 'utf8'))

    // Do a blanket find-replace for /enterprise/{{ currentVersion }}/ to /enterprise/{{currentVersion}}/
    // so that the AST parser recognizes the link as a link node. The spaces prevent it from doing so.
    let newContent = content.replace(currentVersionWithSpacesRegex, currentVersionWithoutSpaces)

    const ast = fromMarkdown(newContent)

    // We can't do async functions within visit, so gather the nodes upfront
    const nodesPerFile = []

    visit(ast, (node) => {
      // Only root-relative links are candidates; asset/public paths, the bare root, and links
      // containing these hardcoded version segments (presumably old releases) are skipped.
      if (node.type !== 'link') return
      if (!node.url.startsWith('/')) return
      if (node.url.startsWith('/assets')) return
      if (node.url.startsWith('/public')) return
      if (node.url.includes('/11.10.340/')) return
      if (node.url.includes('/2.1/')) return
      if (node.url === '/') return

      nodesPerFile.push(node)
    })

    // For every Markdown link...
    for (const node of nodesPerFile) {
      const oldLink = node.url

      // Find and preserve any inline markup in link titles, like [*Foo*](/foo)
      let inlineMarkup = ''
      if (node.children[0].children) {
        inlineMarkup = linkInlineMarkup[node.children[0].type]

        if (!inlineMarkup) {
          console.error(`Cannot find an inline markup entry for ${node.children[0].type}!`)
          process.exit(1)
        }
      }

      // The title is either the first child's own value or, when wrapped in inline markup,
      // the value of that wrapper's first child.
      const oldTitle = node.children[0].value || node.children[0].children[0].value
      const oldMarkdownLink = `[${inlineMarkup}${oldTitle}${inlineMarkup}](${oldLink})`

      // As a blanket rule, only update titles in links that begin with quotes. (Many links
      // have punctuation before the closing quotes, so we'll only check for opening quotes.)
      // Update: "[Foo](/foo)
      // Do not update: [Bar](/bar)
      const hasQuotesAroundLink = newContent.includes(`"${oldMarkdownLink}`)

      // We need to preserve fragments and hardcoded versions if any are found. Both depend
      // only on the original link, so they are computed once rather than once per version.
      const fragmentMatch = oldLink.match(/(#.*$)/)
      const versionMatch = oldLink.match(/(enterprise-server(?:@.[^/]*?)?)\//)

      let foundPage

      // Run through all supported versions...
      for (const version of allVersionKeys) {
        context.currentVersion = version
        // Render the link for each version using the renderContent pipeline, which includes the rewrite-local-links plugin.
        const html = await renderContent(oldMarkdownLink, context)
        const $ = cheerio.load(html, { xmlMode: true })
        const renderedHref = $('a').attr('href')

        // No anchor/href was rendered for this version, so there is nothing to look up.
        // Move on; if no version yields a page, the error below names the link and file.
        if (renderedHref === undefined) continue

        // Remove the fragment for now.
        const linkToCheck = renderedHref.replace(/#.*$/, '').replace(patterns.trailingSlash, '$1')

        // Try to find the rendered link in the set of pages!
        foundPage = findPage(linkToCheck, pageMap, redirects)

        // Once a page is found for a particular version, exit immediately; we don't need to check the other versions
        // because all we care about is the page title and path.
        if (foundPage) {
          break
        }
      }

      if (!foundPage) {
        console.error(
          `Can't find link in pageMap! ${oldLink} in ${file.replace(process.cwd(), '')}`
        )
        process.exit(1)
      }

      // If the original link includes a fragment OR the original title includes Liquid, do not change;
      // otherwise, use the found page title. (We don't want to update the title if a fragment is found because
      // the title likely points to the fragment section header, not the page title.)
      const newTitle =
        fragmentMatch || oldTitle.includes('{%') || !hasQuotesAroundLink
          ? oldTitle
          : foundPage.title

      // If the original link includes a fragment, append it to the found page path.
      // Also remove the language code because Markdown links don't include language codes.
      let newLink = getPathWithoutLanguage(
        fragmentMatch ? foundPage.path + fragmentMatch[1] : foundPage.path
      )

      // If the original link includes a hardcoded version, preserve it; otherwise, remove versioning
      // because Markdown links don't include versioning.
      newLink = versionMatch
        ? `/${versionMatch[1]}${getPathWithoutVersion(newLink)}`
        : getPathWithoutVersion(newLink)

      let newMarkdownLink = `[${inlineMarkup}${newTitle}${inlineMarkup}](${newLink})`

      // Handle a few misplaced quotation marks.
      if (oldMarkdownLink.includes('["')) {
        newMarkdownLink = `"${newMarkdownLink}`
      }

      // Stream the results to console as we find them.
      if (oldMarkdownLink !== newMarkdownLink) {
        console.log('old link', oldMarkdownLink)
        console.log('new link', newMarkdownLink)
        console.log('-------')
      }

      // Use a replacer function so that any `$` sequences in the new title or link (such as
      // `$&` or `$1`) are inserted literally instead of being read as replacement patterns.
      newContent = newContent.replace(oldMarkdownLink, () => newMarkdownLink)
    }

    fs.writeFileSync(file, frontmatter.stringify(newContent, data, { lineWidth: 10000 }))
  }

  console.log('Done!')
}

// Resolve `tryPath` to a page: first as a direct key of `pageMap`, then via the path that
// `getRedirect` returns for it. Yields `{ title, path }` for the first hit, where `path` is
// the key that matched, or undefined when neither lookup finds a page.
function findPage(tryPath, pageMap, redirects) {
  const describePageAt = (pagePath) => {
    const page = pageMap[pagePath]
    return page ? { title: page.title, path: pagePath } : undefined
  }

  // The redirect lookup only runs when the direct lookup misses.
  return (
    describePageAt(tryPath) ||
    describePageAt(getRedirect(tryPath, { redirects, pages: pageMap }))
  )
}