@@ -1540,6 +1540,19 @@ var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
1540
1540
// the src attribute so the images are no longer lazy loaded.
1541
1541
1542
1542
function convertLazyLoadedImages ( $ ) {
1543
// Tries to read a string `src` property out of a JSON-encoded attribute value.
// Returns that string when present; returns false when `str` is not valid
// JSON, when the parsed value has no readable `src`, or when `src` is not a
// string.
var extractSrcFromJSON = function extractSrcFromJSON(str) {
  var candidate;

  try {
    // Property access stays inside the try: JSON.parse('null') yields null,
    // and reading `.src` from it throws just like malformed JSON does.
    candidate = JSON.parse(str).src;
  } catch (e) {
    return false;
  }

  return typeof candidate === 'string' ? candidate : false;
};
1555
+
1543
1556
$ ( 'img' ) . each ( function ( _ , img ) {
1544
1557
var attrs = getAttrs ( img ) ;
1545
1558
@@ -1549,7 +1562,14 @@ function convertLazyLoadedImages($) {
1549
1562
if ( attr !== 'srcset' && IS_LINK . test ( value ) && IS_SRCSET . test ( value ) ) {
1550
1563
$ ( img ) . attr ( 'srcset' , value ) ;
1551
1564
} else if ( attr !== 'src' && attr !== 'srcset' && IS_LINK . test ( value ) && IS_IMAGE . test ( value ) ) {
1552
- $ ( img ) . attr ( 'src' , value ) ;
1565
+ // Is the value a JSON object? If so, we should attempt to extract the image src from the data.
1566
+ var existingSrc = extractSrcFromJSON ( value ) ;
1567
+
1568
+ if ( existingSrc ) {
1569
+ $ ( img ) . attr ( 'src' , existingSrc ) ;
1570
+ } else {
1571
+ $ ( img ) . attr ( 'src' , value ) ;
1572
+ }
1553
1573
}
1554
1574
} ) ;
1555
1575
} ) ;
@@ -2388,6 +2408,14 @@ var MediumExtractor = {
2388
2408
// Is there anything in the content you selected that needs to be transformed
2389
2409
// before it's consumable content? E.g., unusual lazy loaded images
2390
2410
transforms : {
2411
+ // Allow drop cap character.
2412
+ 'section span:first-of-type' : function sectionSpanFirstOfType ( $node ) {
2413
+ var $text = $node . html ( ) ;
2414
+
2415
+ if ( $text . length === 1 && / ^ [ a - z A - Z ( ) ] + $ / . test ( $text ) ) {
2416
+ $node . replaceWith ( $text ) ;
2417
+ }
2418
+ } ,
2391
2419
// Re-write lazy-loaded youtube videos
2392
2420
iframe : function iframe ( $node ) {
2393
2421
var ytRe = / h t t p s : \/ \/ i .e m b e d .l y \/ .+ u r l = h t t p s : \/ \/ i \. y t i m g \. c o m \/ v i \/ ( \w + ) \/ / ;
@@ -2429,7 +2457,7 @@ var MediumExtractor = {
2429
2457
// Is there anything that is in the result that shouldn't be?
2430
2458
// The clean selectors will remove anything that matches from
2431
2459
// the result
2432
- clean : [ 'span' , 'svg' ]
2460
+ clean : [ 'span a ' , 'svg' ]
2433
2461
} ,
2434
2462
date_published : {
2435
2463
selectors : [ [ 'meta[name="article:published_time"]' , 'value' ] ]
@@ -6411,10 +6439,14 @@ function cleanDatePublished(dateString) {
6411
6439
format = _ref . format ;
6412
6440
6413
6441
// If string is in milliseconds or seconds, convert to int and return
6414
- if ( MS_DATE_STRING . test ( dateString ) || SEC_DATE_STRING . test ( dateString ) ) {
6442
+ if ( MS_DATE_STRING . test ( dateString ) ) {
6415
6443
return new Date ( _parseInt ( dateString , 10 ) ) . toISOString ( ) ;
6416
6444
}
6417
6445
6446
+ if ( SEC_DATE_STRING . test ( dateString ) ) {
6447
+ return new Date ( _parseInt ( dateString , 10 ) * 1000 ) . toISOString ( ) ;
6448
+ }
6449
+
6418
6450
var date = createDate ( dateString , timezone , format ) ;
6419
6451
6420
6452
if ( ! date . isValid ( ) ) {
@@ -7546,13 +7578,26 @@ var GenericExcerptExtractor = {
7546
7578
}
7547
7579
} ;
7548
7580
7581
// Counts whitespace-separated pieces in the text of the first <div> found in
// `content`. The markup is loaded with cheerio, the div's text is passed
// through normalizeSpaces, and the result is split on single whitespace
// characters.
// Note: String.prototype.split always yields at least one piece, so this
// never returns less than 1, even when no <div> text is found.
var getWordCount = function getWordCount(content) {
  var $ = cheerio.load(content);
  var firstDivText = $('div').first().text();
  return normalizeSpaces(firstDivText).split(/\s/).length;
};
7587
+
7588
// Fallback word counter that works directly on the markup string instead of
// a parsed document.
//
// Every tag-like `<...>` run is replaced with a space, whitespace runs are
// collapsed to single spaces, and the result is trimmed before counting the
// space-separated words.
//
// @param {string} content - markup (or plain text) to count words in.
// @returns {number} number of words; 0 when nothing but tags/whitespace remains.
var getWordCountAlt = function getWordCountAlt(content) {
  // Work on a local copy rather than reassigning the parameter.
  var text = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();

  // ''.split(' ') yields [''], which would report empty content as one word.
  if (text === '') return 0;

  return text.split(' ').length;
};
7594
+
7549
7595
// Extractor that reports a word count for a piece of content.
var GenericWordCountExtractor = {
  // Reads `content` from the options object and counts it with getWordCount.
  // A result of exactly 1 is re-counted with getWordCountAlt, whose result is
  // returned instead.
  extract: function extract(_ref) {
    var content = _ref.content;
    var primaryCount = getWordCount(content);
    return primaryCount === 1 ? getWordCountAlt(content) : primaryCount;
  }
};
7558
7603
@@ -7715,7 +7760,8 @@ function select(opts) {
7715
7760
_extractionOpts$defau = extractionOpts . defaultCleaner ,
7716
7761
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau ,
7717
7762
allowMultiple = extractionOpts . allowMultiple ;
7718
- var matchingSelector = findMatchingSelector ( $ , selectors , extractHtml , allowMultiple ) ;
7763
+ var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple ;
7764
+ var matchingSelector = findMatchingSelector ( $ , selectors , extractHtml , overrideAllowMultiple ) ;
7719
7765
if ( ! matchingSelector ) return null ;
7720
7766
7721
7767
function transformAndClean ( $node ) {
@@ -7988,7 +8034,7 @@ function _collectAllPages() {
7988
8034
} ) ;
7989
8035
return _context . abrupt ( "return" , _objectSpread ( { } , result , {
7990
8036
total_pages : pages ,
7991
- pages_rendered : pages ,
8037
+ rendered_pages : pages ,
7992
8038
word_count : word_count
7993
8039
} ) ) ;
7994
8040
0 commit comments