Skip to content

Commit a183f22

Browse files
committed
WIP: add citation, prism, and dc metadata
1 parent f5e8701 commit a183f22

File tree

8 files changed

+4160
-23
lines changed

8 files changed

+4160
-23
lines changed

Readability.js

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,6 +1380,14 @@ Readability.prototype = {
13801380
// Strip CDATA markers if present
13811381
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
13821382
var parsed = JSON.parse(content);
1383+
1384+
// some sites, like ones for academic journals, separate metadata for a journal article or paper from the
1385+
// site's own metadata. eg: nature has only @context, @type (WebPage), and mainEntity so *all* relevant metadata
1386+
// would be invisible without this.
1387+
if (parsed["mainEntity"]) {
1388+
parsed = parsed["mainEntity"];
1389+
}
1390+
13831391
if (
13841392
!parsed["@context"] ||
13851393
!parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
@@ -1476,7 +1484,7 @@ Readability.prototype = {
14761484
var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
14771485

14781486
// name is a single value
1479-
var namePattern = /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
1487+
// Matches a <meta name="..."> value made of an optional vocabulary prefix
// (prism, citation, dc, dcterm, og, twitter, parsely, weibo:article,
// weibo:webpage), one of the separators - _ . :, and a known field name.
// "publication_date" is listed explicitly so that "citation_publication_date"
// matches: the published-time lookup reads values["citation_publication_date"],
// and the shorter "publication" alternative cannot match it because the
// pattern is anchored at the end of the string.
var namePattern = /^\s*(?:(prism|citation|dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-_\.:]\s*)?(author|creator|pub-date|publicationDate|publication_date|publication|description|title|site_name)\s*$/i;
14801488

14811489
// Find description tags.
14821490
this._forEachNode(metaElements, function(element) {
@@ -1530,7 +1538,8 @@ Readability.prototype = {
15301538
values["dc:creator"] ||
15311539
values["dcterm:creator"] ||
15321540
values["author"] ||
1533-
values["parsely-author"];
1541+
values["parsely-author"] ||
1542+
values["citation_author"];
15341543

15351544
// get description
15361545
metadata.excerpt = jsonld.excerpt ||
@@ -1550,6 +1559,8 @@ Readability.prototype = {
15501559
metadata.publishedTime = jsonld.datePublished ||
15511560
values["article:published_time"] ||
15521561
values["parsely-pub-date"] ||
1562+
values["citation_publication_date"] ||
1563+
values["prism:publicationDate"] ||
15531564
null;
15541565

15551566
// in many sites the meta value is escaped with HTML entities,

test/generate-testcase.js

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
/* eslint-env node, mocha */
22

3-
var debug = false;
3+
var debug = true;
44

55
var path = require("path");
66
var fs = require("fs");
77
var JSDOM = require("jsdom").JSDOM;
88
var prettyPrint = require("./utils").prettyPrint;
99
var http = require("http");
10+
var https = require("https");
1011
var urlparse = require("url").parse;
1112
var htmltidy = require("htmltidy2").tidy;
1213

@@ -49,38 +50,44 @@ function generateTestcase(slug) {
4950
});
5051
}
5152

52-
function fetchSource(url, callbackFn) {
53-
if (!url) {
54-
console.error("You should pass a URL if the source doesn't exist yet!");
55-
process.exit(1);
56-
return;
57-
}
58-
var client = http;
59-
if (url.indexOf("https") == 0) {
60-
client = require("https");
61-
}
53+
/**
 * Fetch `url` over HTTP or HTTPS and pass the complete response body to `cb`,
 * following redirects along the way.
 *
 * `cb` is invoked exactly once, with the body of the final (non-redirect)
 * response decoded as UTF-8.
 *
 * @param {string} url absolute URL to request.
 * @param {function(string)} cb receives the full body of the final response.
 * @param {number} [redirectsLeft=10] how many more redirects may be followed.
 * @throws {Error} when more than `redirectsLeft` redirects are chained.
 */
function getWithRedirects(url, cb, redirectsLeft = 10) {
  var client = (url.indexOf("https") == 0) ? https : http;

  var options = urlparse(url);
  options.headers = {"User-Agent": FFX_UA};

  client.get(options, (response) => {
    if (debug) {
      console.log("STATUS:", response.statusCode);
      console.log("HEADERS:", JSON.stringify(response.headers));
    }

    var location = response.headers.location;
    var isRedirect = [301, 302, 303, 307, 308].includes(response.statusCode);
    if (isRedirect && location) {
      // Discard the redirect's own body so the socket is released, and return
      // so that `cb` only ever sees the final document.
      response.resume();
      if (redirectsLeft <= 0) {
        throw new Error("Too many redirects while fetching " + url);
      }
      // Location may be relative; resolve it against the URL just requested.
      var target = new URL(location, url).href;
      if (debug) {
        console.log("following redirect", target);
      }
      getWithRedirects(target, cb, redirectsLeft - 1);
      return;
    }

    response.setEncoding("utf-8");
    var rv = "";

    response.on("data", (chunk) => {
      rv += chunk;
    });

    response.on("end", () => {
      if (debug) {
        console.log("End received");
      }
      cb(rv);
    });
  });
}
8381

82+
/**
 * Download the page at `url` and forward its markup to sanitizeSource,
 * which is given `callbackFn` unchanged.
 *
 * Logs an error and exits the process with status 1 when no URL is supplied.
 *
 * @param {string} url address of the page to download.
 * @param {Function} callbackFn passed through to sanitizeSource.
 */
function fetchSource(url, callbackFn) {
  var urlMissing = !url;
  if (urlMissing) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
  }

  function onBodyFetched(body) {
    sanitizeSource(body, callbackFn);
  }

  getWithRedirects(url, onBodyFetched);
}
90+
8491
function sanitizeSource(html, callbackFn) {
8592
htmltidy(new JSDOM(html).serialize(), {
8693
"indent": true,
@@ -185,4 +192,4 @@ if (process.argv[2] === "all") {
185192
});
186193
} else {
187194
generateTestcase(process.argv[2]);
188-
}
195+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"title": "Worldwide divergence of values",
3+
"byline": "Medvedev, Danila",
4+
"dir": null,
5+
"lang": "en",
6+
"excerpt": "Social scientists have long debated the nature of cultural change in a modernizing and globalizing world. Some scholars predicted that national cultures would converge by adopting social values typical of Western democracies. Others predicted that cultural differences in values would persist or even increase over time. We test these competing predictions by analyzing survey data from 1981 to 2022 (n = 406,185) from 76 national cultures. We find evidence of global value divergence. Values emphasizing tolerance and self-expression have diverged most sharply, especially between high-income Western countries and the rest of the world. We also find that countries with similar per-capita GDP levels have held similar values over the last 40 years. Over time, however, geographic proximity has emerged as an increasingly strong correlate of value similarity, indicating that values have diverged globally but converged regionally. The authors test whether social values have become converged or diverged across national cultures over the last 40 years using a 76-country analysis of the World Values Survey. They show that values have diverged, especially between high-income Western countries and the rest of the world.",
7+
"siteName": "Nature",
8+
"publishedTime": null,
9+
"readerable": true
10+
}

test/test-pages/nature/expected.html

Lines changed: 702 additions & 0 deletions
Large diffs are not rendered by default.

test/test-pages/nature/source.html

Lines changed: 2622 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"title": "Why do we need to know about progress if we are concerned about the world's largest problems?",
3+
"byline": "By: Max Roser",
4+
"dir": null,
5+
"excerpt": "Why have we made it our mission to publish “research and data to make progress against the world’s largest problems”?",
6+
"siteName": "Our World in Data",
7+
"publishedTime": null,
8+
"readerable": true
9+
}

0 commit comments

Comments
 (0)