diff --git a/README.md b/README.md index 1e9b8fb..7236e6f 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ html-metadata # MetaData html scraper and parser for Node.js (supports Promises and callback style) -The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, Open Graph, Twitter, EPrints, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags). +The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, JSON-LD, Open Graph, Twitter, EPrints, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags). Planned is support for RDFa, AGLS, and other yet unheard of metadata types. Contributions and requests for other metadata types welcome! diff --git a/index.js b/index.js index 84dd8bf..db29fa4 100644 --- a/index.js +++ b/index.js @@ -121,6 +121,17 @@ exports.parseHighwirePress = function(chtml, callback){ return index.parseHighwirePress(chtml).nodeify(callback); }; +/** + * Retrieves JSON-LD for given html object + * + * @param {Object} chtml html Cheerio object + * @param {Function} [callback] optional callback function + * @return {Object} BBPromise for JSON-LD + */ +exports.parseJsonLd = function(chtml, callback){ + return index.parseJsonLd(chtml).nodeify(callback); +}; + /** * Scrapes OpenGraph data given html object * diff --git a/lib/index.js b/lib/index.js index 33d7254..823d034 100644 --- a/lib/index.js +++ b/lib/index.js @@ -338,6 +338,33 @@ exports.parseHighwirePress = BBPromise.method(function(chtml){ }); +/** + * Returns JSON-LD provided by page given HTML object + * @param {Object} chtml html Cheerio object + * @return {Object} BBPromise for JSON-LD + */ +exports.parseJsonLd = BBPromise.method(function(chtml) { + var json = []; + var jsonLd = chtml('script[type="application/ld+json"]'); + + jsonLd.each(function() { + var contents = chtml(this).text().trim(); + try { + contents = JSON.parse(contents); + } catch (e) { + // Fail silently, just in case there are valid tags + return; + } + json.push(contents); + }); + + if (json.length === 0) { + throw new Error("No JSON-LD valid script tags present on page"); + } + + return json.length > 1 ? json : json[0]; +}); + /** * Scrapes OpenGraph data given html object * @param {Object} chtml html Cheerio object @@ -555,6 +582,7 @@ exports.metadataFunctions = { 'eprints': exports.parseEprints, 'general': exports.parseGeneral, 'highwirePress': exports.parseHighwirePress, + 'jsonLd': exports.parseJsonLd, 'openGraph': exports.parseOpenGraph, 'schemaOrg': exports.parseSchemaOrgMicrodata, 'twitter': exports.parseTwitter diff --git a/package.json b/package.json index 9298175..e0dfa27 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "html-metadata", - "version": "1.5.0", + "version": "1.6.0", "description": "Scrapes metadata of several different standards", "main": "index.js", "dependencies": { diff --git a/test/errors.js b/test/errors.js index 33c0211..473cc47 100644 --- a/test/errors.js +++ b/test/errors.js @@ -8,6 +8,7 @@ var cheerio = require('cheerio'); var meta = require('../index'); var preq = require('preq'); // Promisified Request library var assert = require('./utils/assert.js'); +var fs = require('fs'); // mocha defines to avoid JSHint breakage @@ -98,6 +99,21 @@ describe('errors', function() { }); }); + it('should not find JSON-LD, reject promise', function() { + var url = 'http://example.com'; + return preq.get(url) + .then(function(callRes) { + var $ = cheerio.load(callRes.body); + var prom = meta.parseJsonLd($); + return assert.fails(prom); + }); + }); + + it('should reject promise with malformed JSON-LD', function() { + var $ = cheerio.load(fs.readFileSync('./test/static/turtle_article_errors.html')); + return assert.fails(meta.parseJsonLd($)); + }); + //TODO: Add test for lacking general metadata //TODO: Add test for lacking any metadata diff --git a/test/scraping.js b/test/scraping.js index edd1411..9a75259 100644 --- a/test/scraping.js +++ b/test/scraping.js @@ -169,4 +169,33 @@ describe('scraping', function() { }); }); + describe('JSON-LD tests (for types of Organizations)', function() { + var urls = ['http://www.theguardian.com/us', 'http://jsonld.com/', 'http://www.apple.com/']; + urls.forEach(function(test) { + describe(test, function() { + it('should return an object or array', function() { + return meta(test) + .then(function(res) { + assert.ok(typeof res.jsonLd === 'object'); + }); + }); + + it('should get correct JSON-LD data', function() { + return meta(test) + .then(function(res) { + var result = res.jsonLd; + if (res.jsonLd instanceof Array) { + result = res.jsonLd.filter(function(r) { + return r['@type'] === 'Organization'; + })[0]; + }; + ['@context', '@type', 'url', 'logo'].forEach(function(key) { + assert.ok(result.hasOwnProperty(key)); + }); + }); + }); + }); + }); + }); + }); diff --git a/test/static.js b/test/static.js index f3425b1..918a224 100644 --- a/test/static.js +++ b/test/static.js @@ -17,7 +17,7 @@ describe('static files', function() { var expected; it('should get correct info from turtle movie file', function() { - expected = {"dublinCore":{"title":"Turtles of the Jungle","creator":"http://www.example.com/turtlelvr","description":"A 2008 film about jungle turtles.","date":"2012-02-04 12:00:00","type":"Image.Moving"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"openGraph":{"locale":"en_US","type":"video.movie","title":"Turtles of the Jungle","description":"A 2008 film about jungle turtles.","url":"http://example.com","site_name":"Awesome Turtle Movies Website","image":[{"url":"http://example.com/turtle.jpg"},{"url":"http://example.com/shell.jpg"}],"tag":["turtle","movie","awesome"],"director":"http://www.example.com/PhilTheTurtle","actor":["http://www.example.com/PatTheTurtle","http://www.example.com/SaminaTheTurtle"],"writer":"http://www.example.com/TinaTheTurtle","release_date":"2015-01-14T19:14:27+00:00","duration":"1000000"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":"@Turtlessssssssss","url":"http://www.example.com/turtles","title":"Turtles of the Jungle","description":"A 2008 film about jungle turtles.","player":{"url":"http://www.example.com/turtles/player","width":"400","height":"400","stream":{"url":"http://www.example.com/turtles/turtle.mp4","content_type":"video/mp4"}}}}; + expected = JSON.parse(fs.readFileSync('./test/static/turtle_movie.json')); $ = cheerio.load(fs.readFileSync('./test/static/turtle_movie.html')); return meta.parseAll($).then(function(results){ assert.deepEqual(results, expected); @@ -25,7 +25,7 @@ describe('static files', function() { }); it('should get correct info from turtle article file', function() { - expected = {"bePress":{"series_title":"Turtles","author":"Turtle Lvr","author_institution":"Mediawiki","title":"Turtles are AWESOME!!1","date":"2012","pdf_url":"http://www.example.com/turtlelvr/pdf","abstract_html_url":"http://www.example.com/turtlelvr","publisher":"Turtles Society","online_date":"2012/02/04"},"coins":[{"ctx_ver":"Z39.88-2004","rft_id":"info:doi/http://dx.doi.org/10.5555/12345678","rfr_id":"info:sid/crossref.org:search","rft_val_fmt":"info:ofi/fmt:kev:mtx:journal","rft":{"atitle":"Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory","jtitle":"Journal of Psychoceramics","date":"2008","volume":"5","issue":"11","spage":"1","epage":"3","aufirst":"Josiah","aulast":"Carberry","genre":"article","au":["Josiah Carberry"]}}],"dublinCore":{"title":"Turtles are AWESOME!!1","creator":"http://www.example.com/turtlelvr","description":"Exposition on the awesomeness of turtles","date":"2012-02-04 12:00:00","type":"Text.Article"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"highwirePress":{"journal_title":"Turtles","issn":"1234-5678","doi":"10.1000/123","publication_date":"2012-02-04","title":"Turtles are AWESOME!!1","author":"Turtle Lvr","author_institution":"Mediawiki","volume":"150","issue":"1","firstpage":"123","lastpage":"456","publisher":"Turtles Society","abstract":"Exposition on the awesomeness of turtles."},"openGraph":{"locale":"en_US","type":"article","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","url":"http://example.com","site_name":"Awesome Turtles Website","image":[{"url":"http://example.com/turtle.jpg","secure_url":"https://secure.example.com/turtle.jpg","type":"image/jpeg","width":"400","height":"300"},{"url":"http://example.com/shell.jpg","width":"200","height":"150"}],"audio":{"url":"http://example.com/sound.mp3","secure_url":"https://secure.example.com/sound.mp3","type":"audio/mpeg"},"tag":["turtles","are","awesome"],"section":["Turtles are tough","Turtles are flawless","Turtles are cute"],"published_time":"2012-02-04T12:00:00+00:00","modified_time":"2015-01-14T19:14:27+00:00","author":"http://examples.com/turtlelvr","publisher":"http://mediawiki.org"},"eprints":{"title":"Turtles are AWESOME!!1","creators_name":"http://www.example.com/turtlelvr","abstract":"Exposition on the awesomeness of turtles","datestamp":"2012-02-04 12:00:00","type":"article"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":["@Turtlessssssssss","@Turtlezzzzzzzzzz"],"url":"http://www.example.com/turtles","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","image":{"url":"http://example.com/turtles.jpg","alt":"It's a bunch of turtles!"},"app":{"url":{"iphone":"turtle://","googleplay":"turtle://"},"id":{"iphone":"000","googleplay":"superturtlearticle.androidapp"}}}}; + expected = JSON.parse(fs.readFileSync('./test/static/turtle_article.json')); $ = cheerio.load(fs.readFileSync('./test/static/turtle_article.html')); return meta.parseAll($).then(function(results){ assert.deepEqual(results, expected); @@ -33,8 +33,8 @@ describe('static files', function() { }); it('should be case insensitive on Turtle Article file', function() { - expected = {"bePress":{"series_title":"Turtles","author":"Turtle Lvr","author_institution":"Mediawiki","title":"Turtles are AWESOME!!1","date":"2012","pdf_url":"http://www.example.com/turtlelvr/pdf","abstract_html_url":"http://www.example.com/turtlelvr","publisher":"Turtles Society","online_date":"2012/02/04"},"coins":[{"ctx_ver":"Z39.88-2004","rft_id":"info:doi/http://dx.doi.org/10.5555/12345678","rfr_id":"info:sid/crossref.org:search","rft_val_fmt":"info:ofi/fmt:kev:mtx:journal","rft":{"atitle":"Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory","jtitle":"Journal of Psychoceramics","date":"2008","volume":"5","issue":"11","spage":"1","epage":"3","aufirst":"Josiah","aulast":"Carberry","genre":"article","au":["Josiah Carberry"]}}],"dublinCore":{"title":"Turtles are AWESOME!!1","creator":"http://www.example.com/turtlelvr","description":"Exposition on the awesomeness of turtles","date":"2012-02-04 12:00:00","type":"Text.Article"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"highwirePress":{"journal_title":"Turtles","issn":"1234-5678","doi":"10.1000/123","publication_date":"2012-02-04","title":"Turtles are AWESOME!!1","author":"Turtle Lvr","author_institution":"Mediawiki","volume":"150","issue":"1","firstpage":"123","lastpage":"456","publisher":"Turtles Society","abstract":"Exposition on the awesomeness of turtles."},"openGraph":{"locale":"en_US","type":"article","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","url":"http://example.com","site_name":"Awesome Turtles Website","image":[{"url":"http://example.com/turtle.jpg","secure_url":"https://secure.example.com/turtle.jpg","type":"image/jpeg","width":"400","height":"300"},{"url":"http://example.com/shell.jpg","width":"200","height":"150"}],"audio":{"url":"http://example.com/sound.mp3","secure_url":"https://secure.example.com/sound.mp3","type":"audio/mpeg"},"tag":["turtles","are","awesome"],"section":["Turtles are tough","Turtles are flawless","Turtles are cute"],"published_time":"2012-02-04T12:00:00+00:00","modified_time":"2015-01-14T19:14:27+00:00","author":"http://examples.com/turtlelvr","publisher":"http://mediawiki.org"},"eprints":{"title":"Turtles are AWESOME!!1","creators_name":"http://www.example.com/turtlelvr","abstract":"Exposition on the awesomeness of turtles","datestamp":"2012-02-04 12:00:00","type":"article"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":["@Turtlessssssssss","@Turtlezzzzzzzzzz"],"url":"http://www.example.com/turtles","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","image":{"url":"http://example.com/turtles.jpg","alt":"It's a bunch of turtles!"},"app":{"url":{"iphone":"turtle://","googleplay":"turtle://"},"id":{"iphone":"000","googleplay":"superturtlearticle.androidapp"}}}}; - $ = cheerio.load(fs.readFileSync('./test/static/Turtle_Article.html')); + expected = JSON.parse(fs.readFileSync('./test/static/turtle_article.json')); + $ = cheerio.load(fs.readFileSync('./test/static/turtle_article_case.html')); return meta.parseAll($).then(function(results){ assert.deepEqual(results, expected); }); diff --git a/test/static/turtle_article.html b/test/static/turtle_article.html index 4b71765..0edd4d8 100644 --- a/test/static/turtle_article.html +++ b/test/static/turtle_article.html @@ -130,6 +130,25 @@ + + + + diff --git a/test/static/turtle_article.json b/test/static/turtle_article.json new file mode 100644 index 0000000..086bedf --- /dev/null +++ b/test/static/turtle_article.json @@ -0,0 +1,130 @@ +{ + "bePress": { + "series_title": "Turtles", + "author": "Turtle Lvr", + "author_institution": "Mediawiki", + "title": "Turtles are AWESOME!!1", + "date": "2012", + "pdf_url": "http://www.example.com/turtlelvr/pdf", + "abstract_html_url": "http://www.example.com/turtlelvr", + "publisher": "Turtles Society", + "online_date": "2012/02/04" + }, + "coins": [{ + "ctx_ver": "Z39.88-2004", + "rft_id": "info:doi/http://dx.doi.org/10.5555/12345678", + "rfr_id": "info:sid/crossref.org:search", + "rft_val_fmt": "info:ofi/fmt:kev:mtx:journal", + "rft": { + "atitle": "Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory", + "jtitle": "Journal of Psychoceramics", + "date": "2008", + "volume": "5", + "issue": "11", + "spage": "1", + "epage": "3", + "aufirst": "Josiah", + "aulast": "Carberry", + "genre": "article", + "au": ["Josiah Carberry"] + } + }], + "dublinCore": { + "title": "Turtles are AWESOME!!1", + "creator": "http://www.example.com/turtlelvr", + "description": "Exposition on the awesomeness of turtles", + "date": "2012-02-04 12:00:00", + "type": "Text.Article" + }, + "general": { + "author": "Turtle Lvr", + "authorlink": "http://examples.com/turtlelvr", + "canonical": "http://example.com/turtles", + "description": "Exposition on the awesomeness of turtles", + "publisher": "https://mediawiki.org", + "robots": "we welcome our robot overlords", + "shortlink": "http://example.com/c", + "title": "Turtles are AWESOME!!1 | Awesome Turtles Website", + "lang": "en" + }, + "highwirePress": { + "journal_title": "Turtles", + "issn": "1234-5678", + "doi": "10.1000/123", + "publication_date": "2012-02-04", + "title": "Turtles are AWESOME!!1", + "author": "Turtle Lvr", + "author_institution": "Mediawiki", + "volume": "150", + "issue": "1", + "firstpage": "123", + "lastpage": "456", + "publisher": "Turtles Society", + "abstract": "Exposition on the awesomeness of turtles." + }, + "jsonLd": { + "@context": "http://schema.org", + "@type": "Organization", + "url": "https://www.turtles.com" + }, + "openGraph": { + "locale": "en_US", + "type": "article", + "title": "Turtles are AWESOME!!1", + "description": "Exposition on the awesomeness of turtles", + "url": "http://example.com", + "site_name": "Awesome Turtles Website", + "image": [{ + "url": "http://example.com/turtle.jpg", + "secure_url": "https://secure.example.com/turtle.jpg", + "type": "image/jpeg", + "width": "400", + "height": "300" + }, { + "url": "http://example.com/shell.jpg", + "width": "200", + "height": "150" + }], + "audio": { + "url": "http://example.com/sound.mp3", + "secure_url": "https://secure.example.com/sound.mp3", + "type": "audio/mpeg" + }, + "tag": ["turtles", "are", "awesome"], + "section": ["Turtles are tough", "Turtles are flawless", "Turtles are cute"], + "published_time": "2012-02-04T12:00:00+00:00", + "modified_time": "2015-01-14T19:14:27+00:00", + "author": "http://examples.com/turtlelvr", + "publisher": "http://mediawiki.org" + }, + "eprints": { + "title": "Turtles are AWESOME!!1", + "creators_name": "http://www.example.com/turtlelvr", + "abstract": "Exposition on the awesomeness of turtles", + "datestamp": "2012-02-04 12:00:00", + "type": "article" + }, + "twitter": { + "card": "summary", + "site": "@Turtlessssssssss", + "creator": ["@Turtlessssssssss", "@Turtlezzzzzzzzzz"], + "url": "http://www.example.com/turtles", + "title": "Turtles are AWESOME!!1", + "description": "Exposition on the awesomeness of turtles", + "image": { + "url": "http://example.com/turtles.jpg", + "alt": "It's a bunch of turtles!" + }, + "app": { + "url": { + "iphone": "turtle://", + "googleplay": "turtle://" + }, + "id": { + "iphone": "000", + "googleplay": "superturtlearticle.androidapp" + } + } + } +} + diff --git a/test/static/Turtle_Article.html b/test/static/turtle_article_case.html similarity index 96% rename from test/static/Turtle_Article.html rename to test/static/turtle_article_case.html index 56ceb53..b9fee6e 100644 --- a/test/static/Turtle_Article.html +++ b/test/static/turtle_article_case.html @@ -130,6 +130,14 @@ + + + diff --git a/test/static/turtle_article_errors.html b/test/static/turtle_article_errors.html new file mode 100644 index 0000000..c5600d4 --- /dev/null +++ b/test/static/turtle_article_errors.html @@ -0,0 +1,18 @@ + + +Turtles are AWESOME!!1 | Invalid Turtles Website + + + + + + + + + diff --git a/test/static/turtle_movie.json b/test/static/turtle_movie.json new file mode 100644 index 0000000..1fef875 --- /dev/null +++ b/test/static/turtle_movie.json @@ -0,0 +1,57 @@ +{ + "dublinCore": { + "title": "Turtles of the Jungle", + "creator": "http://www.example.com/turtlelvr", + "description": "A 2008 film about jungle turtles.", + "date": "2012-02-04 12:00:00", + "type": "Image.Moving" + }, + "general": { + "author": "Turtle Lvr", + "authorlink": "http://examples.com/turtlelvr", + "canonical": "http://example.com/turtles", + "description": "Exposition on the awesomeness of turtles", + "publisher": "https://mediawiki.org", + "robots": "we welcome our robot overlords", + "shortlink": "http://example.com/c", + "title": "Turtles are AWESOME!!1 | Awesome Turtles Website", + "lang": "en" + }, + "openGraph": { + "locale": "en_US", + "type": "video.movie", + "title": "Turtles of the Jungle", + "description": "A 2008 film about jungle turtles.", + "url": "http://example.com", + "site_name": "Awesome Turtle Movies Website", + "image": [{ + "url": "http://example.com/turtle.jpg" + }, { + "url": "http://example.com/shell.jpg" + }], + "tag": ["turtle", "movie", "awesome"], + "director": "http://www.example.com/PhilTheTurtle", + "actor": ["http://www.example.com/PatTheTurtle", "http://www.example.com/SaminaTheTurtle"], + "writer": "http://www.example.com/TinaTheTurtle", + "release_date": "2015-01-14T19:14:27+00:00", + "duration": "1000000" + }, + "twitter": { + "card": "summary", + "site": "@Turtlessssssssss", + "creator": "@Turtlessssssssss", + "url": "http://www.example.com/turtles", + "title": "Turtles of the Jungle", + "description": "A 2008 film about jungle turtles.", + "player": { + "url": "http://www.example.com/turtles/player", + "width": "400", + "height": "400", + "stream": { + "url": "http://www.example.com/turtles/turtle.mp4", + "content_type": "video/mp4" + } + } + } +} +