Skip to content

Commit

Permalink
Fix: Relative links (#419)
Browse files Browse the repository at this point in the history
* Fix: Relative URLs in links

Feed discovery with relative URLs in links
Example: https://k47.cz/
Solution to the problem: #417

* Update Feed.php (Fix relative URLs in links)

We will replace the links (add the host if it doesn’t exist) in the content as well

* Update Node.php (Fix relative URLs in links)

* Update Reader.php (Fix relative URLs in links)

Remember the URL for further processing

* Update Link.php (Fix relative URLs in links)

* Update XmlParser.php (Fix relative URLs in links)

* Update Explorer.php

* Update Node.php

Delete tabs
  • Loading branch information
IgorA100 authored Oct 16, 2023
1 parent ed94a3c commit 0bde9ae
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 4 deletions.
7 changes: 5 additions & 2 deletions src/FeedIo/Explorer.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@ public function discover(string $url): array
$stream = $this->client->getResponse($url, new DateTime('@0'));

$internalErrors = libxml_use_internal_errors(true);
$feeds = $this->extractFeeds($stream->getBody());
$feeds = $this->extractFeeds($stream->getBody(), $url);

libxml_use_internal_errors($internalErrors);

return $feeds;
}

protected function extractFeeds(string $html): array
protected function extractFeeds(string $html, string $url = null): array
{
$dom = new DOMDocument();
$dom->loadHTML($html);
Expand All @@ -53,6 +53,9 @@ protected function extractFeeds(string $html): array
// returning
$href = 'https:' . $href;
}
if (!parse_url($href, PHP_URL_HOST) && $url){
$href = parse_url($url, PHP_URL_SCHEME) . '://' . parse_url($url, PHP_URL_HOST) . '/' . ltrim($href,'/');
}
$feeds[] = $href;
}
}
Expand Down
2 changes: 2 additions & 0 deletions src/FeedIo/Feed.php
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ public function rewind(): void

public function add(ItemInterface $item): FeedInterface
{
$item->setHostInContent($this->getHostFromLink());

if ($item->getLastModified() > $this->getLastModified()) {
$this->setLastModified($item->getLastModified());
}
Expand Down
27 changes: 27 additions & 0 deletions src/FeedIo/Feed/Node.php
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,33 @@ protected function setHost(string $link = null): void
}
}

/**
 * Prefix root-relative `href` and `src` URLs in this node's markup with a host.
 *
 * The `content` and `description` properties are rewritten when the concrete
 * class declares them and they are not null. Only URLs that begin with a
 * single slash are touched; `//host/...` and absolute URLs are left alone.
 *
 * @param string|null $host prefix to insert, e.g. "https://example.com"; null is a no-op
 */
protected function setHostInContent(?string $host = null): void
{
    if (is_null($host)) {
        return;
    }

    foreach (['content', 'description'] as $property) {
        if (!property_exists($this, $property) || is_null($this->{$property})) {
            continue;
        }

        $markup = $this->{$property};
        // href first, then src: two separate passes, because the greedy
        // prefix group only reaches the last matching attribute of a tag
        foreach (['href', 'src'] as $attribute) {
            $markup = $this->prefixRootRelativeUrls($markup, $attribute, $host);
        }
        $this->{$property} = $markup;
    }
}

/**
 * Insert $host in front of every root-relative URL held by $attribute in $markup.
 *
 * @return string the rewritten markup, or $markup unchanged if the regex engine fails
 */
private function prefixRootRelativeUrls(string $markup, string $attribute, string $host): string
{
    // A callback keeps $host literal: in a replacement string, "\1" or "$1"
    // inside the host would be interpreted as a back-reference.
    $rewritten = preg_replace_callback(
        '!(<*\s*[^>]*)(' . $attribute . '=)(.?)(\/[^\/])!',
        static function (array $match) use ($attribute, $host): string {
            return $match[1] . ' ' . $attribute . '=' . $match[3] . $host . $match[4];
        },
        $markup
    );

    // null signals a PCRE error (e.g. backtrack limit); keep the original
    // markup rather than wiping the property
    return $rewritten ?? $markup;
}

/**
 * Build the "scheme://host[:port]" prefix of this node's link.
 *
 * @return string|null null when the node has no link, or when the link
 *                     carries no scheme or no host (e.g. a relative link)
 */
public function getHostFromLink(): ?string
{
    $link = $this->getLink();
    if (is_null($link)) {
        return null;
    }

    $parts = parse_url($link);
    // parse_url() returns false on seriously malformed URLs and simply omits
    // the keys of components that are absent, so both must be checked
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        return null;
    }

    $origin = $parts['scheme'] . '://' . $parts['host'];
    // keep an explicit port, otherwise relative links on e.g.
    // http://localhost:8080 would be rewritten to the wrong origin
    if (isset($parts['port'])) {
        $origin .= ':' . $parts['port'];
    }

    return $origin;
}

public function getValue(string $name): ?string
{
foreach ($this->getElementIterator($name) as $element) {
Expand Down
3 changes: 2 additions & 1 deletion src/FeedIo/Parser/XmlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ public function parseNode(NodeInterface $item, DOMElement $element, RuleSet $rul
protected function handleNode(NodeInterface $item, DOMElement $node, RuleSet $ruleSet): void
{
if ($this->isItem($node->tagName) && $item instanceof FeedInterface) {
$newItem = $this->parseNode($item->newItem(), $node, $this->getItemRuleSet());
$linkItem = $item->getLink();
$newItem = $this->parseNode($item->newItem()->setLink($linkItem), $node, $this->getItemRuleSet());
$this->addValidItem($item, $newItem);
} else {
$rule = $ruleSet->get($node->tagName);
Expand Down
1 change: 1 addition & 0 deletions src/FeedIo/Reader.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ public function read(string $url, FeedInterface $feed, DateTime $modifiedSince =
try {
$this->logger->info("hitting {$url}");
$response = $this->client->getResponse($url, $modifiedSince);
$feed->setLink($url);
$document = $this->handleResponse($response, $feed);

return new Result($document, $feed, $modifiedSince, $response, $url);
Expand Down
6 changes: 5 additions & 1 deletion src/FeedIo/Rule/Link.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ class Link extends RuleAbstract
*/
public function setProperty(NodeInterface $node, \DOMElement $element): void
{
    $link = $element->nodeValue;

    // The node's current link must still be the one set before this rule
    // runs when getHostFromLink() is called, so the raw value is not stored
    // on the node until it has been resolved.
    $linkHost = parse_url($link, PHP_URL_HOST);
    if ($linkHost === null || $linkHost === false || $linkHost === '') {
        $base = $node->getHostFromLink();
        if ($base !== null) {
            // "post/1" needs a separator after the host; "/post/1", "?x=1",
            // "#top" and an empty value are appended as they are
            $needsSlash = $link !== '' && strpos('/?#', $link[0]) === false;
            $link = $base . ($needsSlash ? '/' : '') . $link;
        }
    }

    $node->setLink($link);
}

/**
Expand Down

0 comments on commit 0bde9ae

Please sign in to comment.