Skip to content

Commit

Permalink
Reordered functions and fixed testing
Browse files Browse the repository at this point in the history
Fixed ordering

Version bump, Uber->Apple test

Line endings
  • Loading branch information
ethanlee16 committed Dec 8, 2016
1 parent 261a40c commit 6b1ae4c
Show file tree
Hide file tree
Showing 12 changed files with 276 additions and 51 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ html-metadata

# MetaData html scraper and parser for Node.js (supports Promises and callback style)

The aim of this library is to be a comprehensive source for extracting all html embedded metadata. Currently it supports Schema.org microdata using a third party library, a native BEPress, Dublin Core, Highwire Press, Open Graph, Twitter, EPrints, and COinS implementation, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag, or meta description tags).
The aim of this library is to be a comprehensive source for extracting all HTML-embedded metadata. Currently it supports Schema.org microdata (via a third-party library), native implementations of BEPress, Dublin Core, Highwire Press, JSON-LD, Open Graph, Twitter, EPrints, and COinS, and some general metadata that doesn't belong to a particular standard (for instance, the content of the title tag or meta description tags).

Support is planned for RDFa, AGLS, and other as-yet-unheard-of metadata types. Contributions and requests for other metadata types are welcome!

Expand Down
22 changes: 11 additions & 11 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,17 @@ exports.parseHighwirePress = function(chtml, callback){
return index.parseHighwirePress(chtml).nodeify(callback);
};

/**
* Retrieves JSON-LD for given html object
*
* @param {Object} chtml html Cheerio object
* @param {Function} [callback] optional callback function
* @return {Object} BBPromise for JSON-LD
*/
exports.parseJsonLd = function(chtml, callback){
return index.parseJsonLd(chtml).nodeify(callback);
};

/**
* Scrapes OpenGraph data given html object
*
Expand Down Expand Up @@ -154,17 +165,6 @@ exports.parseTwitter = function(chtml, callback){
return index.parseTwitter(chtml).nodeify(callback);
};

/**
* Retrieves JSON-LD for given html object
*
* @param {Object} chtml html Cheerio object
* @param {Function} [callback] optional callback function
* @return {Object} BBPromise for JSON-LD
*/
exports.parseJsonLd = function(chtml, callback){
return index.parseJsonLd(chtml).nodeify(callback);
};

/**
* Global exportable list of scraping promises with string keys
* @type {Object}
Expand Down
58 changes: 29 additions & 29 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,33 @@ exports.parseHighwirePress = BBPromise.method(function(chtml){

});

/**
* Returns JSON-LD provided by page given HTML object
* @param {Object} chtml html Cheerio object
* @return {Object} BBPromise for JSON-LD
*/
exports.parseJsonLd = BBPromise.method(function(chtml) {
var json = [];
var jsonLd = chtml('script[type="application/ld+json"]');

jsonLd.each(function() {
var contents = chtml(this).text().trim();
try {
contents = JSON.parse(contents);
} catch (e) {
// Fail silently, just in case there are valid tags
return;
}
json.push(contents);
});

if (json.length === 0) {
throw new Error("No JSON-LD valid script tags present on page");
}

return json.length > 1 ? json : json[0];
});

/**
* Scrapes OpenGraph data given html object
* @param {Object} chtml html Cheerio object
Expand Down Expand Up @@ -544,33 +571,6 @@ exports.parseTwitter = BBPromise.method(function(chtml) {
});


/**
* Returns JSON-LD provided by page given HTML object
* @param {Object} chtml html Cheerio object
* @return {Object} BBPromise for JSON-LD
*/
exports.parseJsonLd = BBPromise.method(function(chtml) {
var json = [];
var jsonLd = chtml('script[type="application/ld+json"]');

if (jsonLd.length === 0) {
throw new Error("No JSON-LD script tag present on page");
}

jsonLd.each(function() {
var contents = chtml(this).text().trim();
try {
contents = JSON.parse(contents);
} catch (e) {
throw new Error("Could not parse JSON-LD: " + e);
}
json.push(contents);
});

return jsonLd.length > 1 ? json : json[0];
});


/**
* Global exportable list of scraping promises with string keys
* @type {Object}
Expand All @@ -582,8 +582,8 @@ exports.metadataFunctions = {
'eprints': exports.parseEprints,
'general': exports.parseGeneral,
'highwirePress': exports.parseHighwirePress,
'jsonLd': exports.parseJsonLd,
'openGraph': exports.parseOpenGraph,
'schemaOrg': exports.parseSchemaOrgMicrodata,
'twitter': exports.parseTwitter,
'jsonLd': exports.parseJsonLd
'twitter': exports.parseTwitter
};
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "html-metadata",
"version": "1.5.0",
"version": "1.6.0",
"description": "Scrapes metadata of several different standards",
"main": "index.js",
"dependencies": {
Expand Down
2 changes: 1 addition & 1 deletion test/errors.js
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ describe('errors', function() {
});

it('should reject promise with malformed JSON-LD', function() {
var $ = cheerio.load(fs.readFileSync('./test/static/turtle_article.html'));
var $ = cheerio.load(fs.readFileSync('./test/static/turtle_article_errors.html'));
return assert.fails(meta.parseJsonLd($));
});

Expand Down
4 changes: 2 additions & 2 deletions test/scraping.js
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ describe('scraping', function() {
});

describe('JSON-LD tests (for types of Organizations)', function() {
var urls = ['http://www.uber.com/en-GB/', 'http://www.theguardian.com/us', 'http://jsonld.com/'];
var urls = ['http://www.theguardian.com/us', 'http://jsonld.com/', 'http://www.apple.com/'];
urls.forEach(function(test) {
describe(test, function() {
it('should return an object or array', function() {
Expand All @@ -186,7 +186,7 @@ describe('scraping', function() {
var result = res.jsonLd;
if (res.jsonLd instanceof Array) {
result = res.jsonLd.filter(function(r) {
return r['@type'] === 'Organization'
return r['@type'] === 'Organization';
})[0];
};
['@context', '@type', 'url', 'logo'].forEach(function(key) {
Expand Down
6 changes: 3 additions & 3 deletions test/static.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@ describe('static files', function() {
var expected;

it('should get correct info from turtle movie file', function() {
expected = {"dublinCore":{"title":"Turtles of the Jungle","creator":"http://www.example.com/turtlelvr","description":"A 2008 film about jungle turtles.","date":"2012-02-04 12:00:00","type":"Image.Moving"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"openGraph":{"locale":"en_US","type":"video.movie","title":"Turtles of the Jungle","description":"A 2008 film about jungle turtles.","url":"http://example.com","site_name":"Awesome Turtle Movies Website","image":[{"url":"http://example.com/turtle.jpg"},{"url":"http://example.com/shell.jpg"}],"tag":["turtle","movie","awesome"],"director":"http://www.example.com/PhilTheTurtle","actor":["http://www.example.com/PatTheTurtle","http://www.example.com/SaminaTheTurtle"],"writer":"http://www.example.com/TinaTheTurtle","release_date":"2015-01-14T19:14:27+00:00","duration":"1000000"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":"@Turtlessssssssss","url":"http://www.example.com/turtles","title":"Turtles of the Jungle","description":"A 2008 film about jungle turtles.","player":{"url":"http://www.example.com/turtles/player","width":"400","height":"400","stream":{"url":"http://www.example.com/turtles/turtle.mp4","content_type":"video/mp4"}}}};
expected = JSON.parse(fs.readFileSync('./test/static/turtle_movie.json'));
$ = cheerio.load(fs.readFileSync('./test/static/turtle_movie.html'));
return meta.parseAll($).then(function(results){
assert.deepEqual(results, expected);
});
});

it('should get correct info from turtle article file', function() {
expected = {"bePress":{"series_title":"Turtles","author":"Turtle Lvr","author_institution":"Mediawiki","title":"Turtles are AWESOME!!1","date":"2012","pdf_url":"http://www.example.com/turtlelvr/pdf","abstract_html_url":"http://www.example.com/turtlelvr","publisher":"Turtles Society","online_date":"2012/02/04"},"coins":[{"ctx_ver":"Z39.88-2004","rft_id":"info:doi/http://dx.doi.org/10.5555/12345678","rfr_id":"info:sid/crossref.org:search","rft_val_fmt":"info:ofi/fmt:kev:mtx:journal","rft":{"atitle":"Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory","jtitle":"Journal of Psychoceramics","date":"2008","volume":"5","issue":"11","spage":"1","epage":"3","aufirst":"Josiah","aulast":"Carberry","genre":"article","au":["Josiah Carberry"]}}],"dublinCore":{"title":"Turtles are AWESOME!!1","creator":"http://www.example.com/turtlelvr","description":"Exposition on the awesomeness of turtles","date":"2012-02-04 12:00:00","type":"Text.Article"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"highwirePress":{"journal_title":"Turtles","issn":"1234-5678","doi":"10.1000/123","publication_date":"2012-02-04","title":"Turtles are AWESOME!!1","author":"Turtle Lvr","author_institution":"Mediawiki","volume":"150","issue":"1","firstpage":"123","lastpage":"456","publisher":"Turtles Society","abstract":"Exposition on the awesomeness of turtles."},"openGraph":{"locale":"en_US","type":"article","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","url":"http://example.com","site_name":"Awesome Turtles 
Website","image":[{"url":"http://example.com/turtle.jpg","secure_url":"https://secure.example.com/turtle.jpg","type":"image/jpeg","width":"400","height":"300"},{"url":"http://example.com/shell.jpg","width":"200","height":"150"}],"audio":{"url":"http://example.com/sound.mp3","secure_url":"https://secure.example.com/sound.mp3","type":"audio/mpeg"},"tag":["turtles","are","awesome"],"section":["Turtles are tough","Turtles are flawless","Turtles are cute"],"published_time":"2012-02-04T12:00:00+00:00","modified_time":"2015-01-14T19:14:27+00:00","author":"http://examples.com/turtlelvr","publisher":"http://mediawiki.org"},"eprints":{"title":"Turtles are AWESOME!!1","creators_name":"http://www.example.com/turtlelvr","abstract":"Exposition on the awesomeness of turtles","datestamp":"2012-02-04 12:00:00","type":"article"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":["@Turtlessssssssss","@Turtlezzzzzzzzzz"],"url":"http://www.example.com/turtles","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","image":{"url":"http://example.com/turtles.jpg","alt":"It's a bunch of turtles!"},"app":{"url":{"iphone":"turtle://","googleplay":"turtle://"},"id":{"iphone":"000","googleplay":"superturtlearticle.androidapp"}}}};
expected = JSON.parse(fs.readFileSync('./test/static/turtle_article.json'));
$ = cheerio.load(fs.readFileSync('./test/static/turtle_article.html'));
return meta.parseAll($).then(function(results){
assert.deepEqual(results, expected);
});
});

it('should be case insensitive on Turtle Article file', function() {
expected = {"bePress":{"series_title":"Turtles","author":"Turtle Lvr","author_institution":"Mediawiki","title":"Turtles are AWESOME!!1","date":"2012","pdf_url":"http://www.example.com/turtlelvr/pdf","abstract_html_url":"http://www.example.com/turtlelvr","publisher":"Turtles Society","online_date":"2012/02/04"},"coins":[{"ctx_ver":"Z39.88-2004","rft_id":"info:doi/http://dx.doi.org/10.5555/12345678","rfr_id":"info:sid/crossref.org:search","rft_val_fmt":"info:ofi/fmt:kev:mtx:journal","rft":{"atitle":"Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory","jtitle":"Journal of Psychoceramics","date":"2008","volume":"5","issue":"11","spage":"1","epage":"3","aufirst":"Josiah","aulast":"Carberry","genre":"article","au":["Josiah Carberry"]}}],"dublinCore":{"title":"Turtles are AWESOME!!1","creator":"http://www.example.com/turtlelvr","description":"Exposition on the awesomeness of turtles","date":"2012-02-04 12:00:00","type":"Text.Article"},"general":{"author":"Turtle Lvr","authorlink":"http://examples.com/turtlelvr","canonical":"http://example.com/turtles","description":"Exposition on the awesomeness of turtles","publisher":"https://mediawiki.org","robots":"we welcome our robot overlords","shortlink":"http://example.com/c","title":"Turtles are AWESOME!!1 | Awesome Turtles Website", "lang":"en"},"highwirePress":{"journal_title":"Turtles","issn":"1234-5678","doi":"10.1000/123","publication_date":"2012-02-04","title":"Turtles are AWESOME!!1","author":"Turtle Lvr","author_institution":"Mediawiki","volume":"150","issue":"1","firstpage":"123","lastpage":"456","publisher":"Turtles Society","abstract":"Exposition on the awesomeness of turtles."},"openGraph":{"locale":"en_US","type":"article","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","url":"http://example.com","site_name":"Awesome Turtles 
Website","image":[{"url":"http://example.com/turtle.jpg","secure_url":"https://secure.example.com/turtle.jpg","type":"image/jpeg","width":"400","height":"300"},{"url":"http://example.com/shell.jpg","width":"200","height":"150"}],"audio":{"url":"http://example.com/sound.mp3","secure_url":"https://secure.example.com/sound.mp3","type":"audio/mpeg"},"tag":["turtles","are","awesome"],"section":["Turtles are tough","Turtles are flawless","Turtles are cute"],"published_time":"2012-02-04T12:00:00+00:00","modified_time":"2015-01-14T19:14:27+00:00","author":"http://examples.com/turtlelvr","publisher":"http://mediawiki.org"},"eprints":{"title":"Turtles are AWESOME!!1","creators_name":"http://www.example.com/turtlelvr","abstract":"Exposition on the awesomeness of turtles","datestamp":"2012-02-04 12:00:00","type":"article"},"twitter":{"card":"summary","site":"@Turtlessssssssss","creator":["@Turtlessssssssss","@Turtlezzzzzzzzzz"],"url":"http://www.example.com/turtles","title":"Turtles are AWESOME!!1","description":"Exposition on the awesomeness of turtles","image":{"url":"http://example.com/turtles.jpg","alt":"It's a bunch of turtles!"},"app":{"url":{"iphone":"turtle://","googleplay":"turtle://"},"id":{"iphone":"000","googleplay":"superturtlearticle.androidapp"}}}};
expected = JSON.parse(fs.readFileSync('./test/static/turtle_article.json'));
$ = cheerio.load(fs.readFileSync('./test/static/turtle_article_case.html'));
return meta.parseAll($).then(function(results){
assert.deepEqual(results, expected);
Expand Down
18 changes: 15 additions & 3 deletions test/static/turtle_article.html
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,22 @@

<span class="Z3988" title="ctx_ver=Z39.88-2004&amp;rft_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&amp;rfr_id=info%3Asid%2Fcrossref.org%3Asearch&amp;rft.atitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&amp;rft.jtitle=Journal+of+Psychoceramics&amp;rft.date=2008&amp;rft.volume=5&amp;rft.issue=11&amp;rft.spage=1&amp;rft.epage=3&amp;rft.aufirst=Josiah&amp;rft.aulast=Carberry&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.au=Josiah+Carberry"></span>

<script type="application/ld+json">{
"@context" / "http://schema.org",
<script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "Organization",
"url": "https://www.uber.com"
"url": "https://www.turtles.com"
}
</script>
<!-- ignored -->
<script type="application/ld+json">
{
"@id": "https://www.turtles.com/"
"potentialAction" / {
"@type": "ViewAction",
"target": "android-app://com.turtles/"
},
"@type": "WebPage",
"@context": "http://schema.org"
}
</script>

Expand Down
130 changes: 130 additions & 0 deletions test/static/turtle_article.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
{
"bePress": {
"series_title": "Turtles",
"author": "Turtle Lvr",
"author_institution": "Mediawiki",
"title": "Turtles are AWESOME!!1",
"date": "2012",
"pdf_url": "http://www.example.com/turtlelvr/pdf",
"abstract_html_url": "http://www.example.com/turtlelvr",
"publisher": "Turtles Society",
"online_date": "2012/02/04"
},
"coins": [{
"ctx_ver": "Z39.88-2004",
"rft_id": "info:doi/http://dx.doi.org/10.5555/12345678",
"rfr_id": "info:sid/crossref.org:search",
"rft_val_fmt": "info:ofi/fmt:kev:mtx:journal",
"rft": {
"atitle": "Toward a Unified Theory of High-Energy Metaphysics: Silly String Theory",
"jtitle": "Journal of Psychoceramics",
"date": "2008",
"volume": "5",
"issue": "11",
"spage": "1",
"epage": "3",
"aufirst": "Josiah",
"aulast": "Carberry",
"genre": "article",
"au": ["Josiah Carberry"]
}
}],
"dublinCore": {
"title": "Turtles are AWESOME!!1",
"creator": "http://www.example.com/turtlelvr",
"description": "Exposition on the awesomeness of turtles",
"date": "2012-02-04 12:00:00",
"type": "Text.Article"
},
"general": {
"author": "Turtle Lvr",
"authorlink": "http://examples.com/turtlelvr",
"canonical": "http://example.com/turtles",
"description": "Exposition on the awesomeness of turtles",
"publisher": "https://mediawiki.org",
"robots": "we welcome our robot overlords",
"shortlink": "http://example.com/c",
"title": "Turtles are AWESOME!!1 | Awesome Turtles Website",
"lang": "en"
},
"highwirePress": {
"journal_title": "Turtles",
"issn": "1234-5678",
"doi": "10.1000/123",
"publication_date": "2012-02-04",
"title": "Turtles are AWESOME!!1",
"author": "Turtle Lvr",
"author_institution": "Mediawiki",
"volume": "150",
"issue": "1",
"firstpage": "123",
"lastpage": "456",
"publisher": "Turtles Society",
"abstract": "Exposition on the awesomeness of turtles."
},
"jsonLd": {
"@context": "http://schema.org",
"@type": "Organization",
"url": "https://www.turtles.com"
},
"openGraph": {
"locale": "en_US",
"type": "article",
"title": "Turtles are AWESOME!!1",
"description": "Exposition on the awesomeness of turtles",
"url": "http://example.com",
"site_name": "Awesome Turtles Website",
"image": [{
"url": "http://example.com/turtle.jpg",
"secure_url": "https://secure.example.com/turtle.jpg",
"type": "image/jpeg",
"width": "400",
"height": "300"
}, {
"url": "http://example.com/shell.jpg",
"width": "200",
"height": "150"
}],
"audio": {
"url": "http://example.com/sound.mp3",
"secure_url": "https://secure.example.com/sound.mp3",
"type": "audio/mpeg"
},
"tag": ["turtles", "are", "awesome"],
"section": ["Turtles are tough", "Turtles are flawless", "Turtles are cute"],
"published_time": "2012-02-04T12:00:00+00:00",
"modified_time": "2015-01-14T19:14:27+00:00",
"author": "http://examples.com/turtlelvr",
"publisher": "http://mediawiki.org"
},
"eprints": {
"title": "Turtles are AWESOME!!1",
"creators_name": "http://www.example.com/turtlelvr",
"abstract": "Exposition on the awesomeness of turtles",
"datestamp": "2012-02-04 12:00:00",
"type": "article"
},
"twitter": {
"card": "summary",
"site": "@Turtlessssssssss",
"creator": ["@Turtlessssssssss", "@Turtlezzzzzzzzzz"],
"url": "http://www.example.com/turtles",
"title": "Turtles are AWESOME!!1",
"description": "Exposition on the awesomeness of turtles",
"image": {
"url": "http://example.com/turtles.jpg",
"alt": "It's a bunch of turtles!"
},
"app": {
"url": {
"iphone": "turtle://",
"googleplay": "turtle://"
},
"id": {
"iphone": "000",
"googleplay": "superturtlearticle.androidapp"
}
}
}
}

8 changes: 8 additions & 0 deletions test/static/turtle_article_case.html
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,14 @@

<body>

<!-- since keys may be case-sensitive in JSON-LD, take the keys as-is -->
<script type="application/ld+json">{
"@context": "http://schema.org",
"@type": "Organization",
"url": "https://www.turtles.com"
}
</script>

<span class="Z3988" Title="ctx_ver=Z39.88-2004&amp;RFT_id=info%3Adoi%2Fhttp%3A%2F%2Fdx.doi.org%2F10.5555%2F12345678&amp;rfr_id=info%3Asid%2Fcrossref.org%3Asearch&amp;RFT.aTitle=Toward+a+Unified+Theory+of+High-Energy+Metaphysics%3A+Silly+String+Theory&amp;RFT.jTitle=Journal+of+Psychoceramics&amp;RFT.Date=2008&amp;RFT.Volume=5&amp;RFT.issue=11&amp;RFT.Spage=1&amp;RFT.Epage=3&amp;RFT.Aufirst=Josiah&amp;RFT.Aulast=Carberry&amp;RFT_vAL_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;RFT.Genre=Article&amp;RFT.Au=Josiah+Carberry"></span>

</body>
Expand Down
Loading

0 comments on commit 6b1ae4c

Please sign in to comment.