Skip to content

Commit 740ddd3

Browse files
committed
WIP: add citation, prism, and dc metadata
1 parent 5abeedd commit 740ddd3

File tree

8 files changed

+4160
-23
lines changed

8 files changed

+4160
-23
lines changed

Readability.js

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,6 +1383,14 @@ Readability.prototype = {
13831383
// Strip CDATA markers if present
13841384
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
13851385
var parsed = JSON.parse(content);
1386+
1387+
// Some sites (academic journals, for example) describe the page itself at the top level and nest
// the article's own metadata under "mainEntity". Nature, for instance, exposes only @context,
// @type (WebPage) and mainEntity, so every relevant field would be invisible without unwrapping it.
var mainEntity = parsed["mainEntity"];
// Only unwrap a single nested node; a string (URL reference) or an array cannot stand in for
// the document object that the rest of the parsing reads properties from.
if (mainEntity && typeof mainEntity === "object" && !Array.isArray(mainEntity)) {
  // Nested JSON-LD nodes inherit the enclosing @context. Copy it down so the schema.org
  // @context check that follows still passes after `parsed` is replaced.
  if (!mainEntity["@context"]) {
    mainEntity["@context"] = parsed["@context"];
  }
  parsed = mainEntity;
}
1393+
13861394
if (
13871395
!parsed["@context"] ||
13881396
!parsed["@context"].match(/^https?\:\/\/schema\.org\/?$/)
@@ -1479,7 +1487,7 @@ Readability.prototype = {
14791487
var propertyPattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/gi;
14801488

14811489
// name is a single value
1482-
var namePattern = /^\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-\.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*$/i;
1490+
// Matches a <meta name="..."> value made of an optional vocabulary prefix (prism, citation, dc,
// dcterm, og, twitter, parsely, weibo:article, weibo:webpage), one separator (-, _, . or :), and
// a known field name. "publication_date" is listed explicitly so that Highwire-style
// "citation_publication_date" tags are captured; "publication" alone cannot match them because
// the pattern is anchored at both ends.
var namePattern = /^\s*(?:(prism|citation|dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-_\.:]\s*)?(author|creator|pub-date|publication_date|publicationDate|publication|description|title|site_name)\s*$/i;
14831491

14841492
// Find description tags.
14851493
this._forEachNode(metaElements, function(element) {
@@ -1533,7 +1541,8 @@ Readability.prototype = {
15331541
values["dc:creator"] ||
15341542
values["dcterm:creator"] ||
15351543
values["author"] ||
1536-
values["parsely-author"];
1544+
values["parsely-author"] ||
1545+
values["citation_author"];
15371546

15381547
// get description
15391548
metadata.excerpt = jsonld.excerpt ||
@@ -1553,6 +1562,8 @@ Readability.prototype = {
15531562
metadata.publishedTime = jsonld.datePublished ||
15541563
values["article:published_time"] ||
15551564
values["parsely-pub-date"] ||
1565+
values["citation_publication_date"] ||
1566+
values["prism:publicationDate"] ||
15561567
null;
15571568

15581569
// in many sites the meta value is escaped with HTML entities,

test/generate-testcase.js

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
/* eslint-env node, mocha */
22

3-
var debug = false;
3+
// Set to true locally to log response status/headers while fetching; keep false when committing
// so that generating test cases does not dump headers for every page.
var debug = false;
44

55
var path = require("path");
66
var fs = require("fs");
77
var JSDOM = require("jsdom").JSDOM;
88
var prettyPrint = require("./utils").prettyPrint;
99
var http = require("http");
10+
var https = require("https");
1011
var urlparse = require("url").parse;
1112
var htmltidy = require("htmltidy2").tidy;
1213

@@ -49,38 +50,44 @@ function generateTestcase(slug) {
4950
});
5051
}
5152

52-
function fetchSource(url, callbackFn) {
53-
if (!url) {
54-
console.error("You should pass a URL if the source doesn't exist yet!");
55-
process.exit(1);
56-
return;
57-
}
58-
var client = http;
59-
if (url.indexOf("https") == 0) {
60-
client = require("https");
61-
}
53+
/**
 * Fetch `url` over HTTP(S) using the Firefox user agent, following redirects,
 * and hand the body of the final response to `cb`.
 *
 * @param {string} url - Absolute http:// or https:// URL to fetch.
 * @param {function(string): void} cb - Invoked exactly once with the UTF-8
 *   decoded body of the final (non-redirect) response.
 * @param {number} [redirectsLeft=10] - How many more redirects may be
 *   followed before giving up; guards against redirect loops.
 */
function getWithRedirects(url, cb, redirectsLeft = 10) {
  var client = (url.indexOf("https") === 0) ? https : http;

  var options = urlparse(url);
  options.headers = {"User-Agent": FFX_UA};

  client.get(options, (response) => {
    if (debug) {
      console.log("STATUS:", response.statusCode);
      console.log("HEADERS:", JSON.stringify(response.headers));
    }

    var location = response.headers.location;
    var isRedirect = [301, 302, 303, 307, 308].indexOf(response.statusCode) !== -1;
    if (isRedirect && location) {
      // Discard the redirect's own body so the socket is released.
      response.resume();
      if (redirectsLeft <= 0) {
        console.error("Too many redirects while fetching", url);
        process.exit(1);
      }
      // Location may be relative; resolve it against the URL that was just requested.
      var target = new URL(location, url).toString();
      if (debug) console.log("following redirect", target);
      getWithRedirects(target, cb, redirectsLeft - 1);
      // Stop here: otherwise the redirect response would also be read and
      // `cb` would fire a second time with its (usually empty) body.
      return;
    }

    response.setEncoding("utf-8");
    var rv = "";

    response.on("data", (chunk) => {
      rv += chunk;
    });

    response.on("end", () => {
      if (debug) console.log("End received");
      cb(rv);
    });
  }).on("error", (err) => {
    // `cb` has no error channel, so report the failure and stop.
    console.error("Request for", url, "failed:", err.message);
    process.exit(1);
  });
}
8381

82+
/**
 * Download the page at `url` and pass its sanitized HTML on to `callbackFn`.
 * Prints an error and exits with status 1 when no URL is given.
 *
 * @param {string} url - Address of the page to download.
 * @param {Function} callbackFn - Forwarded unchanged to sanitizeSource.
 */
function fetchSource(url, callbackFn) {
  var urlMissing = !url;
  if (urlMissing) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
  }

  // Once the raw page body has arrived, run it through the sanitizer.
  var onFetched = function(html) {
    sanitizeSource(html, callbackFn);
  };
  getWithRedirects(url, onFetched);
}
90+
8491
function sanitizeSource(html, callbackFn) {
8592
htmltidy(new JSDOM(html).serialize(), {
8693
"indent": true,
@@ -185,4 +192,4 @@ if (process.argv[2] === "all") {
185192
});
186193
} else {
187194
generateTestcase(process.argv[2]);
188-
}
195+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"title": "Worldwide divergence of values",
3+
"byline": "Medvedev, Danila",
4+
"dir": null,
5+
"lang": "en",
6+
"excerpt": "Social scientists have long debated the nature of cultural change in a modernizing and globalizing world. Some scholars predicted that national cultures would converge by adopting social values typical of Western democracies. Others predicted that cultural differences in values would persist or even increase over time. We test these competing predictions by analyzing survey data from 1981 to 2022 (n = 406,185) from 76 national cultures. We find evidence of global value divergence. Values emphasizing tolerance and self-expression have diverged most sharply, especially between high-income Western countries and the rest of the world. We also find that countries with similar per-capita GDP levels have held similar values over the last 40 years. Over time, however, geographic proximity has emerged as an increasingly strong correlate of value similarity, indicating that values have diverged globally but converged regionally. The authors test whether social values have become converged or diverged across national cultures over the last 40 years using a 76-country analysis of the World Values Survey. They show that values have diverged, especially between high-income Western countries and the rest of the world.",
7+
"siteName": "Nature",
8+
"publishedTime": null,
9+
"readerable": true
10+
}

test/test-pages/nature/expected.html

Lines changed: 702 additions & 0 deletions
Large diffs are not rendered by default.

test/test-pages/nature/source.html

Lines changed: 2622 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"title": "Why do we need to know about progress if we are concerned about the world's largest problems?",
3+
"byline": "By: Max Roser",
4+
"dir": null,
5+
"excerpt": "Why have we made it our mission to publish “research and data to make progress against the world’s largest problems”?",
6+
"siteName": "Our World in Data",
7+
"publishedTime": null,
8+
"readerable": true
9+
}

0 commit comments

Comments
 (0)