@@ -13,40 +13,39 @@ class CybernewsBridge extends BridgeAbstract
1313
1414 public function collectData ()
1515 {
16- $ sitemapXml = getContents ( self ::URI . '/news-sitemap.xml ' ) ;
16+ $ sitemapUrl = self ::URI . '/news-sitemap.xml ' ;
1717
18+ $ sitemapXml = getContents ($ sitemapUrl );
1819 if (!$ sitemapXml ) {
1920 throwServerException ('Unable to retrieve Cybernews sitemap ' );
2021 }
2122
22- $ sitemap = simplexml_load_string ($ sitemapXml , null , LIBXML_NOCDATA );
23-
23+ $ sitemap = simplexml_load_string ($ sitemapXml , null , LIBXML_NOCDATA | LIBXML_NONET );
2424 if (!$ sitemap ) {
2525 throwServerException ('Unable to parse Cybernews sitemap ' );
2626 }
2727
2828 foreach ($ sitemap ->url as $ entry ) {
29- $ url = trim ((string ) $ entry ->loc );
30- $ lastmod = trim ((string ) $ entry ->lastmod );
29+ $ url = trim ((string ) $ entry ->loc );
30+ $ lastmod = trim ((string ) $ entry ->lastmod );
3131
3232 if (!$ url ) {
3333 continue ;
3434 }
3535
36- $ pathParts = explode ('/ ' , trim (parse_url ($ url , PHP_URL_PATH ), '/ ' ));
37- $ category = isset ($ pathParts [0 ]) && $ pathParts [0 ] !== '' ? $ pathParts [0 ] : '' ;
36+ $ pathParts = explode ('/ ' , trim (parse_url ($ url , PHP_URL_PATH ), '/ ' ));
37+ $ category = isset ($ pathParts [0 ]) && $ pathParts [0 ] !== '' ? $ pathParts [0 ] : '' ;
3838
3939 // Skip non-English versions
40- if (in_array ($ category , ['nl ' , 'de ' ], true )) {
41- continue ;
42- }
40+ // if (in_array($category, ['nl', 'de', 'es', 'it '], true)) {
41+ // continue;
42+ // }
4343
4444 $ namespaces = $ entry ->getNamespaces (true );
4545 $ title = '' ;
4646
4747 if (isset ($ namespaces ['news ' ])) {
4848 $ news = $ entry ->children ($ namespaces ['news ' ])->news ;
49-
5049 if ($ news ) {
5150 $ title = trim ((string ) $ news ->title );
5251 }
@@ -74,18 +73,15 @@ public function collectData()
7473 private function fetchFullArticle (string $ url ): string
7574 {
7675 $ html = getSimpleHTMLDOMCached ($ url );
77-
7876 if (!$ html ) {
7977 return 'Unable to fetch article content ' ;
8078 }
8179
8280 $ article = $ html ->find ('article ' , 0 );
83-
8481 if (!$ article ) {
8582 return 'Unable to parse article content ' ;
8683 }
8784
88- // Remove unnecessary elements
8985 $ removeSelectors = [
9086 'script ' ,
9187 'style ' ,
0 commit comments