From 240f68bf741fe1ce8b7054ac65cabd4c07ed1590 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Mon, 2 Apr 2018 00:17:11 +0200 Subject: [PATCH 01/17] WIP: Check external resources in the main loop as well via asset.load({ metadataOnly: true }) --- lib/index.js | 243 +++++++------------------------------------------- test/index.js | 29 +++--- 2 files changed, 46 insertions(+), 226 deletions(-) diff --git a/lib/index.js b/lib/index.js index 895bf98..d9a1482 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,6 +1,5 @@ const AssetGraph = require('assetgraph'); const async = require('async'); -const request = require('request'); const version = require('../package.json').version; const relationDebugDescription = require('./relationDebugDescription'); const prettyBytes = require('pretty-bytes'); @@ -161,184 +160,6 @@ async function hyperlink( }; } - function httpStatus(asset, attempt = 1) { - const url = asset.url; - const relations = asset._incoming; - - const loadReport = { - operator: 'external-check', - name: `external-check ${url}`, - at: [...new Set(relations.map(r => r.debugDescription))].join( - '\n ' - ), - expected: `200 ${url}` - }; - - return callback => { - if (shouldSkip(loadReport)) { - return setTimeout(callback); - } - - request( - { - method: attempt === 1 ? 'head' : 'get', - url: asset.url, - strictSSL: true, - gzip: true, - headers: { - 'User-Agent': hyperlinkUserAgent, - Accept: - 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, sdch, br' - } - }, - (error, res) => { - if (error) { - const code = error.code; - let actual = code || 'Unknown error'; - - switch (code) { - case 'ENOTFOUND': - actual = `DNS missing: ${asset.hostname}`; - break; - case 'HPE_INVALID_CONSTANT': - if (attempt === 1) { - return httpStatus(asset, attempt + 1)(callback); - } - break; - } - - reportTest({ - ...loadReport, - ok: false, - actual - }); - - return callback(); - } - - const status = res.statusCode; - - if (status >= 200 && status < 300) { - const contentType = res.headers['content-type']; - if (contentType && asset.type) { - const matchContentType = contentType.match( - /^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i - ); - if (matchContentType && asset.expectedTypes) { - asset.contentType = matchContentType[1].toLowerCase(); - asset._tryUpgrade(); - } - } else if (!contentType) { - const contentTypeMisingReport = { - ok: false, - name: `content-type-missing ${asset.urlOrDescription}`, - operator: 'content-type-missing', - expected: - asset.contentType || - `A Content-Type compatible with ${asset.type}`, - actual: contentType, - at: [...new Set(relations.map(r => r.debugDescription))].join( - '\n ' - ) - }; - - if (!shouldSkip(contentTypeMisingReport)) { - reportTest(contentTypeMisingReport); - } - } - } - - // Some servers respond weirdly to HEAD requests. Make a second attempt with GET - if (attempt === 1 && status >= 400 && status < 600) { - return httpStatus(asset, attempt + 1)(callback); - } - - // Some servers (jspm.io) respond with 502 if requesting HEAD, then GET to close in succession. Give the server a second to cool down - if (attempt === 2 && status === 502) { - setTimeout(() => httpStatus(asset, attempt + 1)(callback), 1000); - return; - } - - const redirects = res.request._redirect.redirects; - if (redirects.length > 0) { - const log = [{ redirectUri: url }, ...redirects].map( - (item, idx, arr) => { - if (arr[idx + 1]) { - item.statusCode = arr[idx + 1].statusCode; - } else { - item.statusCode = 200; - } - - return item; - } - ); - - const redirectReport = { - operator: 'external-redirect', - name: `external-redirect ${url}`, - at: [...new Set(relations.map(r => r.debugDescription))].join( - '\n ' - ), - expected: `302 ${url} --> 200 ${log[log.length - 1].redirectUri}` - }; - - const actual = log - .map(redirect => `${redirect.statusCode} ${redirect.redirectUri}`) - .join(' --> '); - - if (!shouldSkip(redirectReport)) { - // A single temporary redirect is allowed - if ([302, 307].includes(log[0].statusCode)) { - if (log.length < 3) { - reportTest({ - ...redirectReport, - expected: actual, - actual, - ok: true - }); - } else { - reportTest({ - ...redirectReport, - expected: `${log[0].statusCode} ${url} --> 200 ${ - log[log.length - 1].redirectUri - }`, - actual, - ok: false - }); - } - } else { - reportTest({ - ...redirectReport, - actual, - ok: false - }); - } - } - } - - if (status === 200) { - reportTest({ - ...loadReport, - ok: true, - actual: loadReport.expected - }); - - return callback(); - } - - reportTest({ - ...loadReport, - actual: `${status} ${url}`, - ok: false - }); - - return callback(); - } - ); - }; - } - if (verbose) { ag.on('addRelation', relation => { console.error('addRelation', relation.toString()); @@ -438,9 +259,10 @@ async function hyperlink( async function processAsset(asset) { if (!processedAssets.has(asset)) { processedAssets.add(asset); + const operator = asset._metadataOnly ? 'external-check' : 'load'; const loadReport = { - operator: 'load', - name: `load ${asset.urlOrDescription}`, + operator, + name: `${operator} ${asset.urlOrDescription}`, expected: `200 ${asset.urlOrDescription}` }; @@ -455,7 +277,8 @@ async function hyperlink( } try { - await asset.load(); + // FIXME: Make sure we do a full load if an asset is added to the queue again in non-metadataOnly mode + await asset.load({ metadataOnly: asset._metadataOnly }); reportTest({ ...loadReport, @@ -476,6 +299,20 @@ async function hyperlink( return; } + if (asset.statusCode >= 300 && asset.statusCode < 400) { + // TODO: Warn about chains of temporary redirects + const redirectRelation = asset.outgoingRelations.find( + r => r.type === 'HttpRedirect' + ); + reportTest({ + ok: asset.statusCode !== 301, + operator: 'external-redirect', + name: `external-redirect ${asset.url}`, + at: loadReport.at, + expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}}` + }); + } + for (const relation of asset.externalRelations) { // Only do work for supported protocols if (!['http:', 'https:', 'file:'].includes(relation.to.protocol)) { @@ -561,8 +398,10 @@ async function hyperlink( } let follow; - - if ( + let metadataOnly = asset._metadataOnly; + if (['HttpRedirect', 'FileRedirect'].includes(relation.type)) { + follow = true; + } else if ( ['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type) ) { follow = false; @@ -582,7 +421,7 @@ async function hyperlink( follow = true; relation.to.stopProcessing = true; } else { - relation.to.check = true; + metadataOnly = true; } } } else if ( @@ -591,19 +430,19 @@ async function hyperlink( if (followSourceMaps) { follow = true; } else { - relation.to.check = true; + metadataOnly = true; } } else if ( ['SourceMapFile', 'SourceMapSource'].includes(relation.type) ) { if (followSourceMaps) { - relation.to.check = true; + metadataOnly = true; } } else { follow = true; } - if (follow) { + if (follow || metadataOnly) { if (assetTypesWithoutRelations.includes(relation.to.type)) { // If we are handling local file-urls, follow but mark as end-of-line in processing if ( @@ -613,15 +452,17 @@ async function hyperlink( relation.to.stopProcessing = !recursive; assetQueue.push(relation.to); } else { - relation.to.check = true; + metadataOnly = true; } } else { assetQueue.push(relation.to); } + relation.to._metadataOnly = metadataOnly; + assetQueue.push(relation.to); } } - if (asset.type === 'Html') { + if (asset.type === 'Html' && !asset._metadataOnly) { // Remember the set of ids in the document before unloading so incoming fragments can be checked: asset.ids = new Set(); for (const element of Array.from( @@ -694,28 +535,6 @@ async function hyperlink( } } - // Check urls - const assetsToCheck = ag - .findAssets({ check: true }) - .filter(asset => !processedAssets.has(asset)); - t.push({ - name: `Crawling ${assetsToCheck.length} outgoing urls` - }); - - await new Promise((resolve, reject) => - async.parallelLimit( - assetsToCheck.map(asset => httpStatus(asset)), - 20, - err => { - if (err) { - reject(err); - } else { - resolve(); - } - } - ) - ); - // Check Content-Type vs. incoming relation targetTypes: for (const asset of ag.findAssets({ expectedTypes: { $exists: true } })) { diff --git a/test/index.js b/test/index.js index 8a0bc18..3f2c971 100644 --- a/test/index.js +++ b/test/index.js @@ -119,7 +119,7 @@ describe('hyperlink', function() { name: 'load https://example.com/', ok: true }); - t.push({ name: 'Crawling 2 outgoing urls' }); + // t.push( { name: 'Crawling 2 outgoing urls' } ); t.push(null, { ok: true, name: 'external-check https://google.com' @@ -240,7 +240,7 @@ describe('hyperlink', function() { ok: false, operator: 'content-type-mismatch', name: 'content-type-mismatch https://example.com/hey.png', - actual: 'Asset is used as both Image and Text', + actual: 'Asset is used as both Png and Text', at: 'https://example.com/ (6:25) ' }); }); @@ -286,8 +286,9 @@ describe('hyperlink', function() { expect(t.push, 'to have a call satisfying', () => { t.push(null, { ok: false, - operator: 'content-type-missing', - name: 'content-type-missing https://example.com/hey.png', + operator: 'error', + actual: + 'https://example.com/hey.png: No Content-Type response header received', at: 'https://example.com/ (6:25) ' }); }); @@ -373,9 +374,9 @@ describe('hyperlink', function() { actual: expect.it('to begin with', 'ENOENT: no such file or directory') }); - t.push({ - name: 'Crawling 0 outgoing urls' - }); + // t.push({ + // name: 'Crawling 0 outgoing urls' + // }); t.push({ name: @@ -768,12 +769,12 @@ describe('hyperlink', function() { { request: 'HEAD https://mycdn.com/404.eot', response: 404 - }, - // retry - { - request: 'GET https://mycdn.com/404.eot', - response: 404 - } + } /*, + // retry + { + request: 'GET https://mycdn.com/404.eot', + response: 404 + }*/ ]); const t = new TapRender(); @@ -792,7 +793,7 @@ describe('hyperlink', function() { operator: 'external-check', name: 'external-check https://mycdn.com/404.eot', expected: '200 https://mycdn.com/404.eot', - actual: '404 https://mycdn.com/404.eot' + actual: 'HTTP 404 Not Found' }); }); }); From 44a32e1c779017f43cf0ad68b1d4c67cd0dd6acc Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Tue, 3 Apr 2018 23:27:38 +0200 Subject: [PATCH 02/17] Don't expect the "Crawling X outgoing urls" message https://github.com/Munter/hyperlink/pull/143#issuecomment-377722339 --- test/index.js | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/test/index.js b/test/index.js index 3f2c971..c97300e 100644 --- a/test/index.js +++ b/test/index.js @@ -119,7 +119,6 @@ describe('hyperlink', function() { name: 'load https://example.com/', ok: true }); - // t.push( { name: 'Crawling 2 outgoing urls' } ); t.push(null, { ok: true, name: 'external-check https://google.com' @@ -374,10 +373,6 @@ describe('hyperlink', function() { actual: expect.it('to begin with', 'ENOENT: no such file or directory') }); - // t.push({ - // name: 'Crawling 0 outgoing urls' - // }); - t.push({ name: 'Connecting to 0 hosts (checking ' @@ -410,11 +405,6 @@ describe('hyperlink', function() { skip: 0, todo: 0 }); - expect(t.push, 'to have a call satisfying', () => { - t.push({ - name: 'Crawling 0 outgoing urls' - }); - }); expect(t.push, 'to have no calls satisfying', () => { t.push(null, { operator: 'fragment-check', @@ -449,11 +439,6 @@ describe('hyperlink', function() { skip: 0, todo: 0 }); - expect(t.push, 'to have a call satisfying', () => { - t.push({ - name: 'Crawling 0 outgoing urls' - }); - }); }); }); From bfe14d7b4f2f368550268d31b5e6e29df9816f20 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Tue, 3 Apr 2018 23:31:52 +0200 Subject: [PATCH 03/17] Accept one more Content-Type warning --- test/index.js | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test/index.js b/test/index.js index c97300e..ca6d8a1 100644 --- a/test/index.js +++ b/test/index.js @@ -233,7 +233,7 @@ describe('hyperlink', function() { t ); - expect(t.close(), 'to satisfy', { fail: 1 }); + expect(t.close(), 'to satisfy', { fail: 2 }); expect(t.push, 'to have a call satisfying', () => { t.push(null, { ok: false, @@ -243,6 +243,15 @@ describe('hyperlink', function() { at: 'https://example.com/ (6:25) ' }); }); + expect(t.push, 'to have a call satisfying', () => { + t.push(null, { + ok: false, + operator: 'content-type-mismatch', + name: 'content-type-mismatch https://example.com/hey.png', + actual: 'Asset is used as both Image and Text', + at: 'https://example.com/ (6:25) ' + }); + }); }); it('should complain if an asset being HEADed has no Content-Type', async function() { From 1abd830b7ee5760281746f74001086cc38bc8a77 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Tue, 3 Apr 2018 23:51:32 +0200 Subject: [PATCH 04/17] Implement crude retrying --- lib/index.js | 35 ++++++++++++++++++++++++++--------- test/index.js | 13 ++++++------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/lib/index.js b/lib/index.js index d9a1482..b6fbf37 100644 --- a/lib/index.js +++ b/lib/index.js @@ -6,9 +6,7 @@ const prettyBytes = require('pretty-bytes'); const net = require('net'); const tls = require('tls'); -const defaultSkipFilters = [ - require('./known-culprits/linkedin') -]; +const defaultSkipFilters = [require('./known-culprits/linkedin')]; const hyperlinkUserAgent = `Hyperlink v${version} (https://www.npmjs.com/package/hyperlink)`; @@ -285,12 +283,31 @@ async function hyperlink( ok: true }); } catch (err) { - reportTest({ - ...loadReport, - ok: false, - actual: err.message - }); - return; + if ( + asset._metadataOnly && + err.statusCode && + err.statusCode >= 400 && + err.statusCode <= 600 + ) { + try { + asset.keepUnpopulated = true; + await asset.load(); // Trigger a GET + } catch (err) { + reportTest({ + ...loadReport, + ok: false, + actual: err.message + }); + return; + } + } else { + reportTest({ + ...loadReport, + ok: false, + actual: err.message + }); + return; + } } // In non-recursive mode local assets might be marked as end-of-line. diff --git a/test/index.js b/test/index.js index ca6d8a1..fa8d0ee 100644 --- a/test/index.js +++ b/test/index.js @@ -763,12 +763,11 @@ describe('hyperlink', function() { { request: 'HEAD https://mycdn.com/404.eot', response: 404 - } /*, - // retry - { - request: 'GET https://mycdn.com/404.eot', - response: 404 - }*/ + }, + { + request: 'GET https://mycdn.com/404.eot', + response: 404 + } ]); const t = new TapRender(); @@ -1951,7 +1950,7 @@ describe('hyperlink', function() { ok: false, at: 'https://example.com/ (6:25) ', expected: '200 https://example.com/hey.png', - actual: '503 https://example.com/hey.png' + actual: 'HTTP 503 Service Unavailable' }); }); }); From ba9e75dc6c59279cffe0deda7ca3676450058303 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Wed, 4 Apr 2018 00:28:08 +0200 Subject: [PATCH 05/17] Fix test (now an external-check message is emitted for the redirect target as well) --- test/index.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/index.js b/test/index.js index fa8d0ee..bc46a5b 100644 --- a/test/index.js +++ b/test/index.js @@ -924,10 +924,9 @@ describe('hyperlink', function() { }, t ); - expect(t.close(), 'to satisfy', { - count: 3, - pass: 3, + count: 4, + pass: 4, fail: 0, skip: 0, todo: 0 From f489f19090c05d38cce3ad39d1919dab5cf11acb Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Wed, 4 Apr 2018 00:33:36 +0200 Subject: [PATCH 06/17] Fix typo --- lib/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/index.js b/lib/index.js index b6fbf37..9f1cb8b 100644 --- a/lib/index.js +++ b/lib/index.js @@ -326,7 +326,7 @@ async function hyperlink( operator: 'external-redirect', name: `external-redirect ${asset.url}`, at: loadReport.at, - expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}}` + expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}` }); } From a6402acad839914d25360bb5abb7cc2bc1a24239 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Wed, 4 Apr 2018 23:24:06 +0200 Subject: [PATCH 07/17] Fix redirect reporting --- lib/index.js | 45 +++++++++++++++++++++++++++++++++++++-------- test/index.js | 6 ++++-- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/lib/index.js b/lib/index.js index 9f1cb8b..8864cad 100644 --- a/lib/index.js +++ b/lib/index.js @@ -316,18 +316,13 @@ async function hyperlink( return; } + // Save info for the redirect check later if (asset.statusCode >= 300 && asset.statusCode < 400) { - // TODO: Warn about chains of temporary redirects const redirectRelation = asset.outgoingRelations.find( r => r.type === 'HttpRedirect' ); - reportTest({ - ok: asset.statusCode !== 301, - operator: 'external-redirect', - name: `external-redirect ${asset.url}`, - at: loadReport.at, - expected: `302 ${asset.url} --> 200 ${redirectRelation.to.url}` - }); + asset._redirectRelation = redirectRelation; + redirectRelation.to._hasIncomingRedirect = true; } for (const relation of asset.externalRelations) { @@ -552,6 +547,40 @@ async function hyperlink( } } + // Check redirects + for (const asset of ag.findAssets({ + _redirectRelation: { $exists: true }, + _hasIncomingRedirect: { $ne: true } + })) { + const redirectChain = [asset]; + let cursor = asset; + while (cursor._redirectRelation) { + cursor = cursor._redirectRelation.to; + redirectChain.push(cursor); + } + let at; + if (asset._incoming && asset._incoming[0].debugDescription) { + at = asset._incoming[0].debugDescription; + } else { + at = `${asset.urlOrDescription} (input URL)`; + } + + reportTest({ + ok: + redirectChain.length <= 3 && + redirectChain + .slice(0, redirectChain.length - 1) + .every(asset => asset.statusCode !== 301), + operator: 'external-redirect', + name: `external-redirect ${asset.url}`, + at, + expected: `302 ${asset.url} --> 200 ${cursor.url}`, + actual: redirectChain + .map(asset => `${asset.statusCode} ${asset.url}`) + .join(' --> ') + }); + } + // Check Content-Type vs. incoming relation targetTypes: for (const asset of ag.findAssets({ expectedTypes: { $exists: true } })) { diff --git a/test/index.js b/test/index.js index bc46a5b..e30bda7 100644 --- a/test/index.js +++ b/test/index.js @@ -992,8 +992,8 @@ describe('hyperlink', function() { ); expect(t.close(), 'to satisfy', { - count: 3, - pass: 2, + count: 5, + pass: 4, fail: 1, skip: 0, todo: 0 @@ -1001,6 +1001,8 @@ describe('hyperlink', function() { expect(t.push, 'to have a call satisfying', () => { t.push(null, { ok: false, + at: + 'https://example.com/ (1:35) ...', name: 'external-redirect https://elsewhere.com/', expected: '302 https://elsewhere.com/ --> 200 https://elsewhere.com/finalDestination', From 6d75a15e216f53521d40bf7d3ff494fbbe4344ca Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Wed, 4 Apr 2018 23:46:54 +0200 Subject: [PATCH 08/17] Carry over parts of the old redirect check logic --- lib/index.js | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/lib/index.js b/lib/index.js index 8864cad..025355d 100644 --- a/lib/index.js +++ b/lib/index.js @@ -565,20 +565,47 @@ async function hyperlink( at = `${asset.urlOrDescription} (input URL)`; } - reportTest({ - ok: - redirectChain.length <= 3 && - redirectChain - .slice(0, redirectChain.length - 1) - .every(asset => asset.statusCode !== 301), + const redirectReport = { operator: 'external-redirect', name: `external-redirect ${asset.url}`, at, - expected: `302 ${asset.url} --> 200 ${cursor.url}`, - actual: redirectChain - .map(asset => `${asset.statusCode} ${asset.url}`) - .join(' --> ') - }); + expected: `302 ${asset.url} --> 200 ${ + redirectChain[redirectChain.length - 1].url + }` + }; + + const actual = redirectChain + .map(asset => `${asset.statusCode} ${asset.url}`) + .join(' --> '); + + if (!shouldSkip(redirectReport)) { + // A single temporary redirect is allowed + if ([302, 307].includes(redirectChain[0].statusCode)) { + if (redirectChain.length < 3) { + reportTest({ + ...redirectReport, + expected: actual, + actual, + ok: true + }); + } else { + reportTest({ + ...redirectReport, + expected: `${redirectChain[0].statusCode} ${asset.url} --> 200 ${ + redirectChain[redirectChain.length - 1].url + }`, + actual, + ok: false + }); + } + } else { + reportTest({ + ...redirectReport, + actual, + ok: false + }); + } + } } // Check Content-Type vs. incoming relation targetTypes: From d03de8ba30654208aea3adbcbfea5906a4186101 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Fri, 6 Apr 2018 00:15:12 +0200 Subject: [PATCH 09/17] Expect one more failing external-redirect test in an unrelated test Instead of push( null, { ok: true, operator: 'external-redirect', name: 'external-redirect https://elsewhere.com/', at: 'https://example.com/ (1:39) ', expected: '302 https://elsewhere.com/ --> 200 http://elsewhere.com/redirectTarget' } ); at reportTest (lib/index.js:104:9) push( null, { ok: true, operator: 'external-redirect', name: 'external-redirect http://elsewhere.com/redirectTarget', at: 'https://elsewhere.com/', expected: '302 http://elsewhere.com/redirectTarget --> 200 https://elsewhere.com/redirectTarget' } ); at reportTest (lib/index.js:104:9) we now get push( null, { ok: false, operator: 'external-redirect', name: 'external-redirect https://elsewhere.com/', at: 'https://example.com/ (1:39) ', expected: '302 https://elsewhere.com/ --> 200 https://elsewhere.com/redirectTarget', actual: '302 https://elsewhere.com/ --> 302 http://elsewhere.com/redirectTarget --> 200 https://elsewhere.com/redirectTarget' } ); at reportTest (lib/index.js:114:7) which is correct --- test/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/index.js b/test/index.js index e30bda7..709fba9 100644 --- a/test/index.js +++ b/test/index.js @@ -1065,7 +1065,7 @@ describe('hyperlink', function() { t ); - expect(t.close(), 'to satisfy', { fail: 1 }); + expect(t.close(), 'to satisfy', { fail: 2 }); expect(t.push, 'to have a call satisfying', () => { t.push(null, { ok: false, From e95412f837f6bde0ae048db547496452e0875cfd Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Fri, 6 Apr 2018 00:30:31 +0200 Subject: [PATCH 10/17] Simplify mixed content report generation --- lib/index.js | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/lib/index.js b/lib/index.js index 025355d..f4660b8 100644 --- a/lib/index.js +++ b/lib/index.js @@ -395,17 +395,10 @@ async function hyperlink( }; if (!shouldSkip(mixedContentReport)) { - if (mixedContentReport.actual !== mixedContentReport.expected) { - reportTest({ - ...mixedContentReport, - ok: false - }); - } else { - reportTest({ - ...mixedContentReport, - ok: true - }); - } + reportTest({ + ...mixedContentReport, + ok: mixedContentReport.actual === mixedContentReport.expected + }); } } From 55ec85649e4310d5b8daa7c17fe4087f4adcd6d9 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Fri, 6 Apr 2018 00:47:07 +0200 Subject: [PATCH 11/17] Make sure that we GET an asset that was previously only HEADed now that a new relation came about --- lib/index.js | 6 ++++- test/index.js | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/lib/index.js b/lib/index.js index f4660b8..232e145 100644 --- a/lib/index.js +++ b/lib/index.js @@ -275,7 +275,6 @@ async function hyperlink( } try { - // FIXME: Make sure we do a full load if an asset is added to the queue again in non-metadataOnly mode await asset.load({ metadataOnly: asset._metadataOnly }); reportTest({ @@ -462,6 +461,11 @@ async function hyperlink( } else { assetQueue.push(relation.to); } + if (relation.to._metadataOnly && !metadataOnly) { + // Make sure that we GET an asset that was previously only HEADed + // now that a new relation came about + processedAssets.delete(relation.to); + } relation.to._metadataOnly = metadataOnly; assetQueue.push(relation.to); } diff --git a/test/index.js b/test/index.js index 709fba9..32be33f 100644 --- a/test/index.js +++ b/test/index.js @@ -1955,4 +1955,66 @@ describe('hyperlink', function() { }); }); }); + + it('should GET an asset that was previously HEADed if new, "non-external" relations show up', async function() { + httpception([ + { + request: 'GET https://example.com/', + response: { + statusCode: 200, + headers: { + 'Content-Type': 'text/html; charset=UTF-8' + }, + body: ` + + + + + + + + + ` + } + }, + { + request: 'HEAD https://example.com/otherpage.html', + response: { + headers: { + 'Content-Type': 'text/html' + } + } + }, + { + request: 'GET https://example.com/script.js', + response: { + headers: { + 'Content-Type': 'application/javascript' + }, + body: 'alert("Hello " + "/otherpage.html".toString("url"));' + } + }, + { + request: 'GET https://example.com/otherpage.html', + response: { + headers: { + 'Content-Type': 'text/html' + }, + body: '

Hello, world!

' + } + } + ]); + + const t = new TapRender(); + sinon.spy(t, 'push'); + await hyperlink( + { + root: 'https://example.com/', + inputUrls: ['https://example.com/'] + }, + t + ); + + expect(t.close(), 'to satisfy', { fail: 0 }); + }); }); From fbaa61bf010405488b76a656274644c502cac7e0 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Fri, 6 Apr 2018 00:55:43 +0200 Subject: [PATCH 12/17] Remove the special casing of "external" links pointing at file: (via stopProcessing) This is now handled in assetgraph by having asset.load({metadataOnly: true}) issue an fs.stat call. https://github.com/Munter/hyperlink/pull/143#discussion_r178435593 --- lib/index.js | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/lib/index.js b/lib/index.js index 232e145..1b79d84 100644 --- a/lib/index.js +++ b/lib/index.js @@ -309,12 +309,6 @@ async function hyperlink( } } - // In non-recursive mode local assets might be marked as end-of-line. - // This is specifically relevant to local file-URLs - if (asset.stopProcessing) { - return; - } - // Save info for the redirect check later if (asset.statusCode >= 300 && asset.statusCode < 400) { const redirectRelation = asset.outgoingRelations.find( @@ -416,17 +410,7 @@ async function hyperlink( if (!relation.crossorigin && recursive) { follow = true; } else if (relation.from !== relation.to) { - // If we are handling local file-urls, follow but mark as end-of-line in processing - if ( - !recursive && - relation.from.protocol === 'file:' && - relation.to.protocol === 'file:' - ) { - follow = true; - relation.to.stopProcessing = true; - } else { - metadataOnly = true; - } + metadataOnly = true; } } else if ( /^(?:JavaScript|Css)Source(?:Mapping)Url$/.test(relation.type) @@ -448,16 +432,7 @@ async function hyperlink( if (follow || metadataOnly) { if (assetTypesWithoutRelations.includes(relation.to.type)) { - // If we are handling local file-urls, follow but mark as end-of-line in processing - if ( - relation.from.protocol === 'file:' && - relation.to.protocol === 'file:' - ) { - relation.to.stopProcessing = !recursive; - assetQueue.push(relation.to); - } else { - metadataOnly = true; - } + metadataOnly = true; } else { assetQueue.push(relation.to); } From 02f31a51a12ebe56d953627d3fb55641c8c4196e Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Fri, 6 Apr 2018 01:04:42 +0200 Subject: [PATCH 13/17] Reinstate a specific content-type-missing operator so it can be filtered https://github.com/Munter/hyperlink/pull/143#discussion_r178435652 --- lib/index.js | 13 +++++++++---- test/index.js | 6 +++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/lib/index.js b/lib/index.js index 1b79d84..bd43bdf 100644 --- a/lib/index.js +++ b/lib/index.js @@ -168,16 +168,22 @@ async function hyperlink( } function handleError(error) { - // Explicitly handle incompatible types warning + // Detect and upgrade certain errors from AssetGraph: + let operator; if (error.stack && error.stack.includes('_warnIncompatibleTypes')) { + operator = 'content-type-mismatch'; + } else if (error.message === 'No Content-Type response header received') { + operator = 'content-type-missing'; + } + if (operator) { const asset = error.asset; const expected = asset.contentType || `A Content-Type compatible with ${asset.type}`; const contentTypeMismatchReport = { ok: false, - operator: 'content-type-mismatch', - name: `content-type-mismatch ${asset.urlOrDescription}`, + operator, + name: `${operator} ${asset.urlOrDescription}`, expected, actual: error.message, at: [...new Set(asset._incoming.map(r => r.debugDescription))].join( @@ -191,7 +197,6 @@ async function hyperlink( return; } - const message = error.message || error; const asset = error.asset || (error.relation && error.relation.to); const report = { diff --git a/test/index.js b/test/index.js index 32be33f..ba80a38 100644 --- a/test/index.js +++ b/test/index.js @@ -294,9 +294,9 @@ describe('hyperlink', function() { expect(t.push, 'to have a call satisfying', () => { t.push(null, { ok: false, - operator: 'error', - actual: - 'https://example.com/hey.png: No Content-Type response header received', + operator: 'content-type-missing', + name: 'content-type-missing https://example.com/hey.png', + actual: 'No Content-Type response header received', at: 'https://example.com/ (6:25) ' }); }); From ae8d47d05180df91545e37e27295f82c4fc18053 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Fri, 20 Apr 2018 00:40:59 +0200 Subject: [PATCH 14/17] Handle redirect cycles --- lib/index.js | 53 ++++++++++++++++++++++++++++++++++++++++---------- test/index.js | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 10 deletions(-) diff --git a/lib/index.js b/lib/index.js index bd43bdf..14521f3 100644 --- a/lib/index.js +++ b/lib/index.js @@ -525,14 +525,17 @@ async function hyperlink( } // Check redirects - for (const asset of ag.findAssets({ - _redirectRelation: { $exists: true }, - _hasIncomingRedirect: { $ne: true } - })) { + + function checkRedirectChainFrom(asset, isCycle) { const redirectChain = [asset]; let cursor = asset; - while (cursor._redirectRelation) { + cursor._processedRedirect = true; + while ( + cursor._redirectRelation && + !redirectChain.includes(cursor._redirectRelation.to) + ) { cursor = cursor._redirectRelation.to; + cursor._processedRedirect = true; redirectChain.push(cursor); } let at; @@ -551,13 +554,23 @@ async function hyperlink( }` }; - const actual = redirectChain - .map(asset => `${asset.statusCode} ${asset.url}`) - .join(' --> '); - if (!shouldSkip(redirectReport)) { // A single temporary redirect is allowed - if ([302, 307].includes(redirectChain[0].statusCode)) { + if (isCycle) { + redirectChain.push(cursor._redirectRelation.to); + } + const actual = redirectChain + .map(asset => `${asset.statusCode} ${asset.url}`) + .join(' --> '); + + if (isCycle) { + reportTest({ + ...redirectReport, + operator: 'redirect-cycle', + actual, + ok: false + }); + } else if ([302, 307].includes(redirectChain[0].statusCode)) { if (redirectChain.length < 3) { reportTest({ ...redirectReport, @@ -585,6 +598,26 @@ async function hyperlink( } } + for (const asset of ag.findAssets({ + _redirectRelation: { $exists: true }, + _hasIncomingRedirect: { $ne: true } + })) { + checkRedirectChainFrom(asset); + } + + // The redirects without _processedRedirect:true at this + // point participate in at least one cycle: + for (const asset of ag + .findAssets({ + _redirectRelation: { $exists: true }, + _processedRedirect: { $ne: true } + }) + .sort((a, b) => parseInt(a.id) - parseInt(b.id))) { + if (!asset._processedRedirect) { + checkRedirectChainFrom(asset, true); + } + } + // Check Content-Type vs. incoming relation targetTypes: for (const asset of ag.findAssets({ expectedTypes: { $exists: true } })) { diff --git a/test/index.js b/test/index.js index ba80a38..1d1b666 100644 --- a/test/index.js +++ b/test/index.js @@ -1077,6 +1077,60 @@ describe('hyperlink', function() { }); }); }); + + it('should report a redirect cycle', async function() { + httpception([ + { + request: 'GET https://example.com/', + response: { + statusCode: 200, + headers: { + 'Content-Type': 'text/html; charset=UTF-8' + }, + body: + '' + } + }, + { + request: 'GET https://elsewhere.com/', + response: { + statusCode: 302, + headers: { + Location: 'https://elsewhere.com/redirectTarget' + } + } + }, + { + request: 'GET https://elsewhere.com/redirectTarget', + response: { + statusCode: 302, + headers: { + Location: 'https://elsewhere.com/' + } + } + } + ]); + + const t = new TapRender(); + sinon.spy(t, 'push'); + await hyperlink( + { + root: 'https://example.com/', + inputUrls: ['https://example.com/'] + }, + t + ); + + expect(t.close(), 'to satisfy', { fail: 1 }); + expect(t.push, 'to have a call satisfying', () => { + t.push(null, { + ok: false, + operator: 'redirect-cycle', + actual: + '302 https://elsewhere.com/ --> 302 https://elsewhere.com/redirectTarget --> 302 https://elsewhere.com/' + }); + }); + }); }); describe('with a preconnect link', function() { From 232c6b177d6877dad4a085298fd7f66c6ae4e4be Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Fri, 20 Apr 2018 00:47:41 +0200 Subject: [PATCH 15/17] Drop keepUnpopulated, just live with a bit of parsing in that rare case --- lib/index.js | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/index.js b/lib/index.js index 14521f3..15520b3 100644 --- a/lib/index.js +++ b/lib/index.js @@ -294,7 +294,6 @@ async function hyperlink( err.statusCode <= 600 ) { try { - asset.keepUnpopulated = true; await asset.load(); // Trigger a GET } catch (err) { reportTest({ From 1e0f202b902730279d05bd481ff0f8fa8501c836 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Fri, 20 Apr 2018 22:38:47 +0200 Subject: [PATCH 16/17] Don't try to compute relationDebugDescription for relations from assets that have only had their metadata loaded --- lib/relationDebugDescription.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/relationDebugDescription.js b/lib/relationDebugDescription.js index 1f1dc10..f46514c 100644 --- a/lib/relationDebugDescription.js +++ b/lib/relationDebugDescription.js @@ -7,7 +7,7 @@ module.exports = function relationDebugDescription(relation) { var asset = relation.from.nonInlineAncestor; - if (asset.isText) { + if (asset.isText && asset.isLoaded) { var text = asset.rawSrc.toString(); var linesBefore = text.split(relation.href)[0].split('\n'); var charsBefore = linesBefore[linesBefore.length - 1]; From 47d597197b2946802af9988e2f6c9e0eaa808e5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20M=C3=BCller?= Date: Mon, 23 Jul 2018 16:20:45 +0200 Subject: [PATCH 17/17] Implement loading of external assets to allow a fragment check even though the asset would normally not be populated. Same as #147 --- lib/index.js | 28 ++++++++++++++++++---------- test/index.js | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 10 deletions(-) diff --git a/lib/index.js b/lib/index.js index 15520b3..0374646 100644 --- a/lib/index.js +++ b/lib/index.js @@ -313,6 +313,21 @@ async function hyperlink( } } + if (asset.type === 'Html' && !asset._metadataOnly) { + // Remember the set of ids in the document before unloading so incoming fragments can be checked: + asset.ids = new Set(); + for (const element of Array.from( + asset.parseTree.querySelectorAll('[id]') + )) { + asset.ids.add(element.getAttribute('id')); + } + } + + if (asset.stopProcessing) { + asset.unload(); + return; + } + // Save info for the redirect check later if (asset.statusCode >= 300 && asset.statusCode < 400) { const redirectRelation = asset.outgoingRelations.find( @@ -413,6 +428,9 @@ async function hyperlink( ) { if (!relation.crossorigin && recursive) { follow = true; + } else if (relation.fragment && relation.fragment !== '#') { + follow = true; + relation.to.stopProcessing = true; } else if (relation.from !== relation.to) { metadataOnly = true; } @@ -450,16 +468,6 @@ async function hyperlink( } } - if (asset.type === 'Html' && !asset._metadataOnly) { - // Remember the set of ids in the document before unloading so incoming fragments can be checked: - asset.ids = new Set(); - for (const element of Array.from( - asset.parseTree.querySelectorAll('[id]') - )) { - asset.ids.add(element.getAttribute('id')); - } - } - // Conserve memory by immediately unloading the asset: if (verbose) { reportTest({ diff --git a/test/index.js b/test/index.js index 1d1b666..b47918c 100644 --- a/test/index.js +++ b/test/index.js @@ -617,6 +617,55 @@ describe('hyperlink', function() { }); }); + it('should not issue an error when referencing an external asset with an existing fragment', async function() { + httpception([ + { + request: 'GET https://example.com/', + response: { + statusCode: 200, + headers: { + 'Content-Type': 'text/html; charset=UTF-8' + }, + body: + '
Link' + } + }, + { + request: 'GET https://test.external.tools/foo.html', + response: { + statusCode: 200, + headers: { + 'Content-Type': 'text/html; charset=UTF-8' + }, + body: + '
I exist
' + } + } + ]); + + const t = new TapRender(); + sinon.spy(t, 'push'); + await hyperlink( + { + recursive: true, + root: 'https://example.com/', + inputUrls: ['https://example.com/'] + }, + t + ); + + expect(t.close(), 'to satisfy', { fail: 0 }); + expect(t.push, 'to have a call satisfying', () => { + t.push(null, { + ok: true, + operator: 'fragment-check', + name: + 'fragment-check https://example.com/ --> https://test.external.tools/foo.html#frag', + expected: 'id="frag"' + }); + }); + }); + it('should be fine when an asset references itself with an empty fragment', async function() { httpception([ {