Skip to content

Commit

Permalink
Adds JSON-LD functions; Merge pull request #50 from ethanlee16/json-ld
Browse files Browse the repository at this point in the history
Scrapes JSON-LD script tags from HTML. Closes issue #39.

Bug:T148837
  • Loading branch information
mvolz authored Dec 8, 2016
2 parents e8c018d + 6b1ae4c commit 9b9d642
Show file tree
Hide file tree
Showing 12 changed files with 322 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ html-metadata

# MetaData html scraper and parser for Node.js (supports Promises and callback style)

The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, Open Graph, Twitter, EPrints, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags).
The aim of this library is to be a comprehensive source for extracting all HTML-embedded metadata. Currently it supports Schema.org microdata using a third-party library; native implementations of BEPress, Dublin Core, Highwire Press, JSON-LD, Open Graph, Twitter, EPrints, and COinS; and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags).

Support is planned for RDFa, AGLS, and other as-yet-unheard-of metadata types. Contributions and requests for other metadata types are welcome!

Expand Down
11 changes: 11 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,17 @@ exports.parseHighwirePress = function(chtml, callback){
return index.parseHighwirePress(chtml).nodeify(callback);
};

/**
* Retrieves JSON-LD for given html object
*
* @param {Object} chtml html Cheerio object
* @param {Function} [callback] optional callback function
* @return {Object} BBPromise for JSON-LD
*/
exports.parseJsonLd = function(chtml, callback){
return index.parseJsonLd(chtml).nodeify(callback);
};

/**
* Scrapes OpenGraph data given html object
*
Expand Down
28 changes: 28 additions & 0 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,33 @@ exports.parseHighwirePress = BBPromise.method(function(chtml){

});

/**
* Returns JSON-LD provided by page given HTML object
* @param {Object} chtml html Cheerio object
* @return {Object} BBPromise for JSON-LD
*/
exports.parseJsonLd = BBPromise.method(function(chtml) {
var json = [];
var jsonLd = chtml('script[type="application/ld+json"]');

jsonLd.each(function() {
var contents = chtml(this).text().trim();
try {
contents = JSON.parse(contents);
} catch (e) {
// Fail silently, just in case there are valid tags
return;
}
json.push(contents);
});

if (json.length === 0) {
throw new Error("No JSON-LD valid script tags present on page");
}

return json.length > 1 ? json : json[0];
});

/**
* Scrapes OpenGraph data given html object
* @param {Object} chtml html Cheerio object
Expand Down Expand Up @@ -555,6 +582,7 @@ exports.metadataFunctions = {
'eprints': exports.parseEprints,
'general': exports.parseGeneral,
'highwirePress': exports.parseHighwirePress,
'jsonLd': exports.parseJsonLd,
'openGraph': exports.parseOpenGraph,
'schemaOrg': exports.parseSchemaOrgMicrodata,
'twitter': exports.parseTwitter
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "html-metadata",
"version": "1.5.0",
"version": "1.6.0",
"description": "Scrapes metadata of several different standards",
"main": "index.js",
"dependencies": {
Expand Down
16 changes: 16 additions & 0 deletions test/errors.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ var cheerio = require('cheerio');
var meta = require('../index');
var preq = require('preq'); // Promisified Request library
var assert = require('./utils/assert.js');
var fs = require('fs');


// mocha defines to avoid JSHint breakage
Expand Down Expand Up @@ -98,6 +99,21 @@ describe('errors', function() {
});
});

// Loads the body of a live page (http://example.com) into cheerio and hands
// the promise from meta.parseJsonLd to assert.fails.
// NOTE(review): relies on network access and on example.com continuing to
// serve a page without JSON-LD script tags - confirm this is acceptable in CI.
// NOTE(review): assert.fails comes from ./utils/assert.js; presumably it passes
// only when the promise rejects - verify against that helper.
it('should not find JSON-LD, reject promise', function() {
var url = 'http://example.com';
return preq.get(url)
.then(function(callRes) {
var $ = cheerio.load(callRes.body);
var prom = meta.parseJsonLd($);
return assert.fails(prom);
});
});

// Loads the static fixture turtle_article_errors.html, whose only
// application/ld+json script contains syntactically invalid JSON, and hands
// the promise from meta.parseJsonLd to assert.fails.
// NOTE(review): the fixture path is relative to the process working directory,
// so this presumably expects mocha to be run from the repository root.
it('should reject promise with malformed JSON-LD', function() {
var $ = cheerio.load(fs.readFileSync('./test/static/turtle_article_errors.html'));
return assert.fails(meta.parseJsonLd($));
});

//TODO: Add test for lacking general metadata
//TODO: Add test for lacking any metadata

Expand Down
29 changes: 29 additions & 0 deletions test/scraping.js
Original file line number Diff line number Diff line change
Expand Up @@ -169,4 +169,33 @@ describe('scraping', function() {
});
});

// Live-site checks: each URL is scraped with meta() and the `jsonLd` entry of
// the result is inspected. When the page yields several JSON-LD documents the
// one typed "Organization" is the one checked.
// NOTE(review): these tests need network access and depend on the listed
// sites continuing to publish Organization JSON-LD.
describe('JSON-LD tests (for types of Organizations)', function() {
    var urls = ['http://www.theguardian.com/us', 'http://jsonld.com/', 'http://www.apple.com/'];
    urls.forEach(function(test) {
        describe(test, function() {
            it('should return an object or array', function() {
                return meta(test)
                .then(function(res) {
                    // typeof null is also 'object', so rule null out explicitly
                    assert.ok(res.jsonLd !== null && typeof res.jsonLd === 'object');
                });
            });

            it('should get correct JSON-LD data', function() {
                return meta(test)
                .then(function(res) {
                    var result = res.jsonLd;
                    if (Array.isArray(result)) {
                        result = result.filter(function(r) {
                            return Boolean(r) && r['@type'] === 'Organization';
                        })[0];
                    }
                    // Fail with an assertion (not a TypeError) when no
                    // Organization entry was found
                    assert.ok(result !== null && typeof result === 'object');
                    ['@context', '@type', 'url', 'logo'].forEach(function(key) {
                        assert.ok(Object.prototype.hasOwnProperty.call(result, key));
                    });
                });
            });
        });
    });
});

});
8 changes: 4 additions & 4 deletions test/static.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,24 @@ describe('static files', function() {
var expected;

it('should get correct info from turtle movie file', function() {
expected = {"dublinCore":{"title":"Turtles of the Jungle","creator":"http://www.example.com/turtlelvr","description":"A 2008 film about jungle turtles.","date":"2012-02-04 12:00:00","type":"Image.Moving"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"openGraph":{"locale":"en_US","type":"video.movie","title":"Turtles of the Jungle","description":"A 2008 film about jungle turtles.","url":"http://example.com","site_name":"Awesome Turtle Movies Website","image":[{"url":"http://example.com/turtle.jpg"},{"url":"http://example.com/shell.jpg"}],"tag":["turtle","movie","awesome"],"director":"http://www.example.com/PhilTheTurtle","actor":["http://www.example.com/PatTheTurtle","http://www.example.com/SaminaTheTurtle"],"writer":"http://www.example.com/TinaTheTurtle","release_date":"2015-01-14T19:14:27+00:00","duration":"1000000"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":"@Turtlessssssssss","url":"http://www.example.com/turtles","title":"Turtles of the Jungle","description":"A 2008 film about jungle turtles.","player":{"url":"http://www.example.com/turtles/player","width":"400","height":"400","stream":{"url":"http://www.example.com/turtles/turtle.mp4","content_type":"video/mp4"}}}};
expected = JSON.parse(fs.readFileSync('./test/static/turtle_movie.json'));
$ = cheerio.load(fs.readFileSync('./test/static/turtle_movie.html'));
return meta.parseAll($).then(function(results){
assert.deepEqual(results, expected);
});
});

it('should get correct info from turtle article file', function() {
expected = {"bePress":{"series_title":"Turtles","author":"Turtle Lvr","author_institution":"Mediawiki","title":"Turtles are AWESOME!!1","date":"2012","pdf_url":"http://www.example.com/turtlelvr/pdf","abstract_html_url":"http://www.example.com/turtlelvr","publisher":"Turtles Society","online_date":"2012/02/04"},"coins":[{"ctx_ver":"Z39.88-2004","rft_id":"info:doi/http://dx.doi.org/10.5555/12345678","rfr_id":"info:sid/crossref.org:search","rft_val_fmt":"info:ofi/fmt:kev:mtx:journal","rft":{"atitle":"Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory","jtitle":"Journal of Psychoceramics","date":"2008","volume":"5","issue":"11","spage":"1","epage":"3","aufirst":"Josiah","aulast":"Carberry","genre":"article","au":["Josiah Carberry"]}}],"dublinCore":{"title":"Turtles are AWESOME!!1","creator":"http://www.example.com/turtlelvr","description":"Exposition on the awesomeness of turtles","date":"2012-02-04 12:00:00","type":"Text.Article"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"highwirePress":{"journal_title":"Turtles","issn":"1234-5678","doi":"10.1000/123","publication_date":"2012-02-04","title":"Turtles are AWESOME!!1","author":"Turtle Lvr","author_institution":"Mediawiki","volume":"150","issue":"1","firstpage":"123","lastpage":"456","publisher":"Turtles Society","abstract":"Exposition on the awesomeness of turtles."},"openGraph":{"locale":"en_US","type":"article","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","url":"http://example.com","site_name":"Awesome Turtles 
Website","image":[{"url":"http://example.com/turtle.jpg","secure_url":"https://secure.example.com/turtle.jpg","type":"image/jpeg","width":"400","height":"300"},{"url":"http://example.com/shell.jpg","width":"200","height":"150"}],"audio":{"url":"http://example.com/sound.mp3","secure_url":"https://secure.example.com/sound.mp3","type":"audio/mpeg"},"tag":["turtles","are","awesome"],"section":["Turtles are tough","Turtles are flawless","Turtles are cute"],"published_time":"2012-02-04T12:00:00+00:00","modified_time":"2015-01-14T19:14:27+00:00","author":"http://examples.com/turtlelvr","publisher":"http://mediawiki.org"},"eprints":{"title":"Turtles are AWESOME!!1","creators_name":"http://www.example.com/turtlelvr","abstract":"Exposition on the awesomeness of turtles","datestamp":"2012-02-04 12:00:00","type":"article"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":["@Turtlessssssssss","@Turtlezzzzzzzzzz"],"url":"http://www.example.com/turtles","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","image":{"url":"http://example.com/turtles.jpg","alt":"It's a bunch of turtles!"},"app":{"url":{"iphone":"turtle://","googleplay":"turtle://"},"id":{"iphone":"000","googleplay":"superturtlearticle.androidapp"}}}};
expected = JSON.parse(fs.readFileSync('./test/static/turtle_article.json'));
$ = cheerio.load(fs.readFileSync('./test/static/turtle_article.html'));
return meta.parseAll($).then(function(results){
assert.deepEqual(results, expected);
});
});

it('should be case insensitive on Turtle Article file', function() {
expected = {"bePress":{"series_title":"Turtles","author":"Turtle Lvr","author_institution":"Mediawiki","title":"Turtles are AWESOME!!1","date":"2012","pdf_url":"http://www.example.com/turtlelvr/pdf","abstract_html_url":"http://www.example.com/turtlelvr","publisher":"Turtles Society","online_date":"2012/02/04"},"coins":[{"ctx_ver":"Z39.88-2004","rft_id":"info:doi/http://dx.doi.org/10.5555/12345678","rfr_id":"info:sid/crossref.org:search","rft_val_fmt":"info:ofi/fmt:kev:mtx:journal","rft":{"atitle":"Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory","jtitle":"Journal of Psychoceramics","date":"2008","volume":"5","issue":"11","spage":"1","epage":"3","aufirst":"Josiah","aulast":"Carberry","genre":"article","au":["Josiah Carberry"]}}],"dublinCore":{"title":"Turtles are AWESOME!!1","creator":"http://www.example.com/turtlelvr","description":"Exposition on the awesomeness of turtles","date":"2012-02-04 12:00:00","type":"Text.Article"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"highwirePress":{"journal_title":"Turtles","issn":"1234-5678","doi":"10.1000/123","publication_date":"2012-02-04","title":"Turtles are AWESOME!!1","author":"Turtle Lvr","author_institution":"Mediawiki","volume":"150","issue":"1","firstpage":"123","lastpage":"456","publisher":"Turtles Society","abstract":"Exposition on the awesomeness of turtles."},"openGraph":{"locale":"en_US","type":"article","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","url":"http://example.com","site_name":"Awesome Turtles 
Website","image":[{"url":"http://example.com/turtle.jpg","secure_url":"https://secure.example.com/turtle.jpg","type":"image/jpeg","width":"400","height":"300"},{"url":"http://example.com/shell.jpg","width":"200","height":"150"}],"audio":{"url":"http://example.com/sound.mp3","secure_url":"https://secure.example.com/sound.mp3","type":"audio/mpeg"},"tag":["turtles","are","awesome"],"section":["Turtles are tough","Turtles are flawless","Turtles are cute"],"published_time":"2012-02-04T12:00:00+00:00","modified_time":"2015-01-14T19:14:27+00:00","author":"http://examples.com/turtlelvr","publisher":"http://mediawiki.org"},"eprints":{"title":"Turtles are AWESOME!!1","creators_name":"http://www.example.com/turtlelvr","abstract":"Exposition on the awesomeness of turtles","datestamp":"2012-02-04 12:00:00","type":"article"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":["@Turtlessssssssss","@Turtlezzzzzzzzzz"],"url":"http://www.example.com/turtles","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","image":{"url":"http://example.com/turtles.jpg","alt":"It's a bunch of turtles!"},"app":{"url":{"iphone":"turtle://","googleplay":"turtle://"},"id":{"iphone":"000","googleplay":"superturtlearticle.androidapp"}}}};
$ = cheerio.load(fs.readFileSync('./test/static/Turtle_Article.html'));
expected = JSON.parse(fs.readFileSync('./test/static/turtle_article.json'));
$ = cheerio.load(fs.readFileSync('./test/static/turtle_article_case.html'));
return meta.parseAll($).then(function(results){
assert.deepEqual(results, expected);
});
Expand Down
19 changes: 19 additions & 0 deletions test/static/turtle_article.html
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,25 @@

<span class="Z3988" title="ctx_ver=Z39.88-2004&amp;rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&amp;rfr_id=info%3Asid%2Fcrossref.org%3Asearch&amp;rft.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&amp;rft.jtitle=Journal+of+Psychoceramics&amp;rft.date=2008&amp;rft.volume=5&amp;rft.issue=11&amp;rft.spage=1&amp;rft.epage=3&amp;rft.aufirst=Josiah&amp;rft.aulast=Carberry&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.au=Josiah+Carberry"></span>

<script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "Organization",
"url": "https://www.turtles.com"
}
</script>
<!-- ignored -->
<script type="application/ld+json">
{
"@id": "https://www.turtles.com/"
"potentialAction" / {
"@type": "ViewAction",
"target": "android-app://com.turtles/"
},
"@type": "WebPage",
"@context": "http://schema.org"
}
</script>

</body>

</html>
130 changes: 130 additions & 0 deletions test/static/turtle_article.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
{
"bePress": {
"series_title": "Turtles",
"author": "Turtle Lvr",
"author_institution": "Mediawiki",
"title": "Turtles are AWESOME!!1",
"date": "2012",
"pdf_url": "http://www.example.com/turtlelvr/pdf",
"abstract_html_url": "http://www.example.com/turtlelvr",
"publisher": "Turtles Society",
"online_date": "2012/02/04"
},
"coins": [{
"ctx_ver": "Z39.88-2004",
"rft_id": "info:doi/http://dx.doi.org/10.5555/12345678",
"rfr_id": "info:sid/crossref.org:search",
"rft_val_fmt": "info:ofi/fmt:kev:mtx:journal",
"rft": {
"atitle": "Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory",
"jtitle": "Journal of Psychoceramics",
"date": "2008",
"volume": "5",
"issue": "11",
"spage": "1",
"epage": "3",
"aufirst": "Josiah",
"aulast": "Carberry",
"genre": "article",
"au": ["Josiah Carberry"]
}
}],
"dublinCore": {
"title": "Turtles are AWESOME!!1",
"creator": "http://www.example.com/turtlelvr",
"description": "Exposition on the awesomeness of turtles",
"date": "2012-02-04 12:00:00",
"type": "Text.Article"
},
"general": {
"author": "Turtle Lvr",
"authorlink": "http://examples.com/turtlelvr",
"canonical": "http://example.com/turtles",
"description": "Exposition on the awesomeness of turtles",
"publisher": "https://mediawiki.org",
"robots": "we welcome our robot overlords",
"shortlink": "http://example.com/c",
"title": "Turtles are AWESOME!!1 | Awesome Turtles Website",
"lang": "en"
},
"highwirePress": {
"journal_title": "Turtles",
"issn": "1234-5678",
"doi": "10.1000/123",
"publication_date": "2012-02-04",
"title": "Turtles are AWESOME!!1",
"author": "Turtle Lvr",
"author_institution": "Mediawiki",
"volume": "150",
"issue": "1",
"firstpage": "123",
"lastpage": "456",
"publisher": "Turtles Society",
"abstract": "Exposition on the awesomeness of turtles."
},
"jsonLd": {
"@context": "http://schema.org",
"@type": "Organization",
"url": "https://www.turtles.com"
},
"openGraph": {
"locale": "en_US",
"type": "article",
"title": "Turtles are AWESOME!!1",
"description": "Exposition on the awesomeness of turtles",
"url": "http://example.com",
"site_name": "Awesome Turtles Website",
"image": [{
"url": "http://example.com/turtle.jpg",
"secure_url": "https://secure.example.com/turtle.jpg",
"type": "image/jpeg",
"width": "400",
"height": "300"
}, {
"url": "http://example.com/shell.jpg",
"width": "200",
"height": "150"
}],
"audio": {
"url": "http://example.com/sound.mp3",
"secure_url": "https://secure.example.com/sound.mp3",
"type": "audio/mpeg"
},
"tag": ["turtles", "are", "awesome"],
"section": ["Turtles are tough", "Turtles are flawless", "Turtles are cute"],
"published_time": "2012-02-04T12:00:00+00:00",
"modified_time": "2015-01-14T19:14:27+00:00",
"author": "http://examples.com/turtlelvr",
"publisher": "http://mediawiki.org"
},
"eprints": {
"title": "Turtles are AWESOME!!1",
"creators_name": "http://www.example.com/turtlelvr",
"abstract": "Exposition on the awesomeness of turtles",
"datestamp": "2012-02-04 12:00:00",
"type": "article"
},
"twitter": {
"card": "summary",
"site": "@Turtlessssssssss",
"creator": ["@Turtlessssssssss", "@Turtlezzzzzzzzzz"],
"url": "http://www.example.com/turtles",
"title": "Turtles are AWESOME!!1",
"description": "Exposition on the awesomeness of turtles",
"image": {
"url": "http://example.com/turtles.jpg",
"alt": "It's a bunch of turtles!"
},
"app": {
"url": {
"iphone": "turtle://",
"googleplay": "turtle://"
},
"id": {
"iphone": "000",
"googleplay": "superturtlearticle.androidapp"
}
}
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,14 @@

<body>

<!-- since keys may be case-sensitive in JSON-LD, take the keys as-is -->
<script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "Organization",
"url": "https://www.turtles.com"
}
</script>

<span class="Z3988" Title="ctx_ver=Z39.88-2004&amp;RFT_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&amp;rfr_id=info%3Asid%2Fcrossref.org%3Asearch&amp;RFT.aTitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&amp;RFT.jTitle=Journal+of+Psychoceramics&amp;RFT.Date=2008&amp;RFT.Volume=5&amp;RFT.issue=11&amp;RFT.Spage=1&amp;RFT.Epage=3&amp;RFT.Aufirst=Josiah&amp;RFT.Aulast=Carberry&amp;RFT_vAL_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;RFT.Genre=Article&amp;RFT.Au=Josiah+Carberry"></span>

</body>
Expand Down
18 changes: 18 additions & 0 deletions test/static/turtle_article_errors.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<html lang="en">
<head>
<title>Turtles are AWESOME!!1 | Invalid Turtles Website</title>
</head>

<body>

<script type="application/ld+json">
{
"@context" / "http://schema.org",
"@type": "Organization"
"url": "https://www.turtles.com"
}
</script>

</body>

</html>
Loading

0 comments on commit 9b9d642

Please sign in to comment.