Commit 8dd2418

feat: Tooling to find dead links
1 parent 74881ff commit 8dd2418

7 files changed (+235, -74 lines)

lib/broken-links.js (+93)

@@ -0,0 +1,93 @@
+const fetch = require('node-fetch')
+const fsPromises = require('fs').promises
+const isUrl = require('is-url')
+const path = require('path')
+const readdirp = require('readdirp')
+const yaml = require('yaml')
+
+const topDir = path.dirname(__dirname)
+
+// walk an object subtree looking for URL strings
+const getObjectUrls = (root) => {
+  const urls = []
+  const queue = [root]
+  while (queue.length !== 0) {
+    const vals = Object.values(queue.shift())
+    urls.push(...vals.filter(isUrl))
+    queue.push(...vals.filter((v) => typeof v === 'object'))
+  }
+  return urls
+}
+
+// scrape a url to see if the link is broken.
+// return a Promise that resolves as { url, err }
+const scrape = (url) =>
+  fetch(url, { method: 'HEAD' }) // just scrape headers; body not needed
+    .then(
+      (res) => ({
+        url,
+        err: res.ok ? null : `${res.status} ${res.statusText}`,
+        status: res.status,
+      }),
+      (err) => ({ url, err, status: -2 })
+    )
+
+// scrape all the urls found in a yml file.
+// report broken links to console.log().
+// return a Promise that resolves as an array of broken links: [ { url, err }, ... ]
+const processYmlEntry = (entry) =>
+  fsPromises
+    .readFile(entry.fullPath, { encoding: 'utf8' })
+    .then((file) => {
+      try {
+        return yaml.parse(file)
+      } catch (error) {
+        console.error(`Failed to parse ${entry.path}. Skipping.`)
+        return { disabled: true }
+      }
+    })
+    .then((o) => (o.disabled ? [] : getObjectUrls(o)))
+    .then(async (urls) => {
+      const results = []
+
+      for (const url of urls) {
+        // Scrape one by one to handle rate limiting
+        const r = await scrape(url)
+        results.push(r)
+      }
+
+      return results
+    })
+    .then((results) => results.filter((res) => !!res.err))
+    .then((fails) => {
+      fails.forEach((f) => console.log(`${entry.path} - ${f.url} (${f.err})`))
+      return fails
+    })
+
+const findBrokenLinks = (start = 0, end = Infinity) =>
+  readdirp
+    .promise(topDir, {
+      fileFilter: '*.yml',
+      directoryFilter: (entry) => entry.path.startsWith('apps'),
+    })
+    .then(async (entries) => {
+      const result = []
+      let limitedEntries = entries
+
+      if (start !== 0 || end !== Infinity) {
+        limitedEntries = entries.slice(start, end)
+      }
+
+      for (const entry of limitedEntries) {
+        console.log(`Processing ${entry.path}`)
+        result.push({
+          entry,
+          result: await processYmlEntry(entry),
+        })
+      }
+
+      return result
+    })
+    .then((arr) => arr.filter((inner) => !!inner.result.length))
+
+module.exports = findBrokenLinks

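For orientation, here is a minimal usage sketch of the module added above. It is not part of this commit; the require path and the 0-25 range are assumptions. findBrokenLinks(start, end) resolves to an array of { entry, result } pairs, one per yml file that contained at least one broken link.

// Minimal sketch, not part of this commit; assumes the repository root as cwd
// and that the range 0-25 (hypothetical) is the slice of app files to check.
const findBrokenLinks = require('./lib/broken-links')

findBrokenLinks(0, 25).then((failures) => {
  for (const { entry, result } of failures) {
    // result is an array of { url, err, status } for this yml file
    console.log(entry.path, result.map(({ url, err }) => `${url} (${err})`))
  }
})
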
package-lock.json (+53 -14)

Some generated files are not rendered by default.

package.json (+6 -4)

@@ -35,6 +35,7 @@
   "license": "MIT",
   "devDependencies": {
     "@octokit/rest": "^15.15.1",
+    "@zeit/fetch-retry": "^5.0.0",
     "bottleneck": "^1.16.0",
     "chai": "^4.2.0",
     "check-for-leaks": "^1.2.0",
@@ -46,7 +47,6 @@
     "get-image-colors": "^1.8.1",
     "github-url-to-object": "^4.0.2",
     "human-interval": "^0.1.6",
-    "lint-staged": "^10.0.9",
     "husky": "^4.2.3",
     "image-size": "^0.5.0",
     "imagemin": "^6.1.0",
@@ -55,21 +55,23 @@
     "is-hexcolor": "^1.0.0",
     "is-url": "^1.2.2",
     "jimp": "^0.3.5",
+    "js-yaml": "^3.13.1",
+    "lint-staged": "^10.0.9",
     "make-color-accessible": "^1.2.0",
     "mkdirp": "^0.5.1",
     "mocha": "^5.2.0",
     "node-fetch": "^2.6.0",
     "npm-run-all": "^4.0.1",
     "pick-a-good-color": "^1.1.1",
+    "prettier": "^2.0.2",
     "readdirp": "^3.0.2",
     "recursive-readdir-sync": "^1.0.6",
     "rimraf": "^2.6.1",
     "sharp": "^0.23.0",
     "sinon": "^7.2.2",
     "slugg": "^1.1.0",
-    "prettier": "^2.0.2",
-    "yamljs": "^0.2.8",
-    "js-yaml": "^3.13.1"
+    "yaml": "^1.9.2"
   },
   "engines": {
     "node": ">=8"

script/find-broken-links.js (+19 -53)

@@ -1,67 +1,33 @@
 #!/usr/bin/env node

+const findBrokenLinks = require('../lib/broken-links')
+
 /* Links can break at any time and it's outside of the repo's control,
 so it doesn't make sense to run this script as part of CI. Instead,
 this should be run periodically as part of a separate process. */

 /* TODO: should this do anything corrective as well?
 e.g. if all the links in a file are dead, disable the file? */

-const fetch = require('node-fetch')
-const fsPromises = require('fs').promises
-const isUrl = require('is-url')
-const path = require('path')
-const process = require('process')
-const readdirp = require('readdirp')
-const yaml = require('yamljs')
-
-// walk an object subtree looking for URL strings
-const getObjectUrls = (root) => {
-  const urls = []
-  const queue = [root]
-  while (queue.length !== 0) {
-    const vals = Object.values(queue.shift())
-    urls.push(...vals.filter(isUrl))
-    queue.push(...vals.filter((v) => typeof v === 'object'))
-  }
-  return urls
-}
+process.on('unhandledRejection', (reason, p) => {
+  console.log('Unhandled Rejection at: Promise', p, 'reason:', reason)
+})

-// scrape a url to see if the link is broken.
-// return a Promise that resolves as { url, err }
-const scrape = (url) =>
-  fetch(url, { method: 'HEAD' }) // just scrape headers; body not needed
-    .then(
-      (res) => ({
-        url,
-        err: res.ok ? null : `${res.status} ${res.statusText}`,
-      }),
-      (err) => ({ url, err })
-    )
+const numberArgs = process.argv.filter((v) => /^\d+$/.test(v))
+const possibleStart =
+  numberArgs.length > 0 ? parseInt(numberArgs[0], 10) : undefined
+const possibleEnd =
+  numberArgs.length > 0 ? parseInt(numberArgs[1], 10) : undefined

-// scrape all the urls found in a yml file.
-// report broken links to console.log().
-// return a Promise that resolves as an array of broken links: [ { url, err }, ... ]
-const processYmlEntry = (entry) =>
-  fsPromises
-    .readFile(entry.fullPath, { encoding: 'utf8' })
-    .then(yaml.parse)
-    .then((o) => (o.disabled ? [] : getObjectUrls(o)))
-    .then((urls) => urls.map(scrape))
-    .then((scrapePromises) => Promise.all(scrapePromises))
-    .then((results) => results.filter((res) => !!res.err))
-    .then((fails) => {
-      fails.forEach((f) => console.log(`${entry.path} - ${f.url} (${f.err})`))
-      return fails
-    })
+console.log(
+  `Checking apps ${possibleStart || 0} through ${
+    possibleEnd || 'infinity'
+  } for broken links`
+)

-const topDir = path.dirname(__dirname)
-readdirp
-  .promise(topDir, {
-    fileFilter: '*.yml',
-    directoryFilter: (entry) => entry.path.startsWith('apps'),
+findBrokenLinks(possibleStart, possibleEnd)
+  .then((failArrays) => {
+    console.log(`${failArrays.length} failure groups`)
+    return failArrays.flat()
   })
-  .then((entries) => entries.map(processYmlEntry))
-  .then((processPromises) => Promise.all(processPromises))
-  .then((failArrays) => failArrays.flat())
   .then((fails) => process.exit(fails.length))

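As a hedged usage note (the numbers are hypothetical): the script reads up to two numeric arguments from process.argv, so node script/find-broken-links.js 100 200 would check only app files 100 through 200, while running it with no arguments checks every apps/*.yml file; the process exits with the length of the final fails array as its exit code, so 0 means no broken links were found.
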
script/remove-broken-links.js (+58)

@@ -0,0 +1,58 @@
+#!/usr/bin/env node
+
+const fs = require('fs').promises
+
+const findBrokenLinks = require('../lib/broken-links')
+
+/* Links can break at any time and it's outside of the repo's control,
+so it doesn't make sense to run this script as part of CI. Instead,
+this should be run periodically as part of a separate process. */
+
+process.on('unhandledRejection', (reason, p) => {
+  console.log('Unhandled Rejection at: Promise', p, 'reason:', reason)
+})
+
+const numberArgs = process.argv.filter((v) => /^\d+$/.test(v))
+const possibleStart =
+  numberArgs.length > 0 ? parseInt(numberArgs[0], 10) : undefined
+const possibleEnd =
+  numberArgs.length > 0 ? parseInt(numberArgs[1], 10) : undefined
+
+console.log(
+  `Checking apps ${possibleStart || 0} through ${
+    possibleEnd || 'infinity'
+  } for broken links`
+)
+
+function isRateLimited(failures = []) {
+  return failures.every(({ status }) => status === 429)
+}
+
+async function main() {
+  const failArrays = (await findBrokenLinks(possibleStart, possibleEnd)).filter(
+    (failure) => {
+      return !isRateLimited(failure.result)
+    }
+  )
+
+  console.log(`Will disable ${failArrays.length} entries`)
+
+  for (const failure of failArrays) {
+    console.log(failure.result)
+    const deadLinks = failure.result.map(({ url }) => url).join(', ')
+    let data = await fs.readFile(failure.entry.fullPath, { encoding: 'utf-8' })
+
+    if (!data.endsWith('\n')) {
+      data += `\n`
+    }
+
+    data += `disabled: true # Dead link(s): ${deadLinks}\n`
+
+    await fs.writeFile(failure.entry.fullPath, data, { encoding: 'utf-8' })
+
+    console.log(data)
+    console.log(`\n---\n`)
+  }
+}
+
+main()

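For illustration (the file name and URL here are hypothetical), a run of this script leaves a failing entry such as apps/example/example.yml ending with a line like disabled: true # Dead link(s): https://example.com/gone. That is the same disabled flag lib/broken-links.js checks before collecting URLs, so disabled entries are skipped on subsequent runs.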