Skip to content

Commit ad8d4aa

Browse files
authored
release: 2.2.3 (#703)
1 parent 635fcf6 commit ad8d4aa

File tree

4 files changed

+64
-11
lines changed

4 files changed

+64
-11
lines changed

CHANGELOG.md

+7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Mercury Parser Changelog
22

3+
### 2.2.3 (Oct 24, 2022)
4+
5+
- [[`635fcf6356`](https://github.com/postlight/parser/commit/635fcf6356)] - **fix**: handle sec & ms timestamps properly (#702) (Austin)
6+
- [[`ab401822aa`](https://github.com/postlight/parser/commit/ab401822aa)] - maintenance update - october 2022 (#696) (Michael Ashley)
7+
- [[`8ca8a5f7e5`](https://github.com/postlight/parser/commit/8ca8a5f7e5)] - **feat**: add postlight.com custom extractor (#695) (Sarah Doire)
8+
- [[`39b9ff55c4`](https://github.com/postlight/parser/commit/39b9ff55c4)] - **release**: 2.2.2 (#689) (John Holdun)
9+
310
### 2.2.2 (Sept 08, 2022)
411

512
##### Commits

dist/mercury.js

+55-9
Original file line numberDiff line numberDiff line change
@@ -1540,6 +1540,19 @@ var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
15401540
// the src attribute so the images are no longer lazy loaded.
15411541

15421542
function convertLazyLoadedImages($) {
1543+
// Some lazy-load attributes hold a JSON blob rather than a bare URL.
// Given such an attribute value, return the blob's string `src` property.
// Returns false when the value is not valid JSON or has no string `src`.
var extractSrcFromJSON = function extractSrcFromJSON(str) {
  var parsed;

  try {
    parsed = JSON.parse(str);
  } catch (e) {
    // Not JSON at all (the common case: a plain URL).
    return false;
  }

  // JSON.parse can legitimately yield null, which has no properties to read.
  var candidate = parsed === null ? undefined : parsed.src;
  return typeof candidate === 'string' ? candidate : false;
};
1555+
15431556
$('img').each(function (_, img) {
15441557
var attrs = getAttrs(img);
15451558

@@ -1549,7 +1562,14 @@ function convertLazyLoadedImages($) {
15491562
if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
15501563
$(img).attr('srcset', value);
15511564
} else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
1552-
$(img).attr('src', value);
1565+
// Is the value a JSON object? If so, we should attempt to extract the image src from the data.
1566+
var existingSrc = extractSrcFromJSON(value);
1567+
1568+
if (existingSrc) {
1569+
$(img).attr('src', existingSrc);
1570+
} else {
1571+
$(img).attr('src', value);
1572+
}
15531573
}
15541574
});
15551575
});
@@ -2388,6 +2408,14 @@ var MediumExtractor = {
23882408
// Is there anything in the content you selected that needs to be transformed
23892409
// before it's consumable content? E.g., unusual lazy loaded images
23902410
transforms: {
2411+
// Allow drop cap character.
2412+
'section span:first-of-type': function sectionSpanFirstOfType($node) {
2413+
var $text = $node.html();
2414+
2415+
if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
2416+
$node.replaceWith($text);
2417+
}
2418+
},
23912419
// Re-write lazy-loaded youtube videos
23922420
iframe: function iframe($node) {
23932421
var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@@ -2429,7 +2457,7 @@ var MediumExtractor = {
24292457
// Is there anything that is in the result that shouldn't be?
24302458
// The clean selectors will remove anything that matches from
24312459
// the result
2432-
clean: ['span', 'svg']
2460+
clean: ['span a', 'svg']
24332461
},
24342462
date_published: {
24352463
selectors: [['meta[name="article:published_time"]', 'value']]
@@ -6411,10 +6439,14 @@ function cleanDatePublished(dateString) {
64116439
format = _ref.format;
64126440

64136441
// If the string is a millisecond or second timestamp, parse it as an integer and return the corresponding ISO date string
6414-
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
6442+
if (MS_DATE_STRING.test(dateString)) {
64156443
return new Date(_parseInt(dateString, 10)).toISOString();
64166444
}
64176445

6446+
if (SEC_DATE_STRING.test(dateString)) {
6447+
return new Date(_parseInt(dateString, 10) * 1000).toISOString();
6448+
}
6449+
64186450
var date = createDate(dateString, timezone, format);
64196451

64206452
if (!date.isValid()) {
@@ -7546,13 +7578,26 @@ var GenericExcerptExtractor = {
75467578
}
75477579
};
75487580

7581+
// Count words in the text of the first <div> of the given markup.
// The text is passed through normalizeSpaces and split on single whitespace
// characters, so text that yields an empty string still counts as 1.
var getWordCount = function getWordCount(content) {
  var firstDiv = cheerio.load(content)('div').first();
  var words = normalizeSpaces(firstDiv.text()).split(/\s/);
  return words.length;
};
7587+
7588+
// Fallback word counter that works on the raw markup string instead of a
// parsed DOM: every tag is replaced by a space, whitespace runs are collapsed,
// and the remaining space-separated tokens are counted.
//
// content: markup string.
// Returns the number of tokens, or 0 when nothing but tags/whitespace remains.
var getWordCountAlt = function getWordCountAlt(content) {
  var text = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();

  // ''.split(' ') yields [''], which would report one word for empty markup.
  if (text === '') return 0;

  return text.split(' ').length;
};
7594+
75497595
var GenericWordCountExtractor = {
75507596
extract: function extract(_ref) {
75517597
var content = _ref.content;
7552-
var $ = cheerio.load(content);
7553-
var $content = $('div').first();
7554-
var text = normalizeSpaces($content.text());
7555-
return text.split(/\s/).length;
7598+
var count = getWordCount(content);
7599+
if (count === 1) count = getWordCountAlt(content);
7600+
return count;
75567601
}
75577602
};
75587603

@@ -7715,7 +7760,8 @@ function select(opts) {
77157760
_extractionOpts$defau = extractionOpts.defaultCleaner,
77167761
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau,
77177762
allowMultiple = extractionOpts.allowMultiple;
7718-
var matchingSelector = findMatchingSelector($, selectors, extractHtml, allowMultiple);
7763+
var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
7764+
var matchingSelector = findMatchingSelector($, selectors, extractHtml, overrideAllowMultiple);
77197765
if (!matchingSelector) return null;
77207766

77217767
function transformAndClean($node) {
@@ -7988,7 +8034,7 @@ function _collectAllPages() {
79888034
});
79898035
return _context.abrupt("return", _objectSpread({}, result, {
79908036
total_pages: pages,
7991-
pages_rendered: pages,
8037+
rendered_pages: pages,
79928038
word_count: word_count
79938039
}));
79948040

dist/mercury.web.js

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@postlight/parser",
3-
"version": "2.2.2",
3+
"version": "2.2.3",
44
"description": "Postlight Parser transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
55
"author": "Postlight <[email protected]>",
66
"homepage": "https://reader.postlight.com",

0 commit comments

Comments
 (0)