Skip to content

Commit

Permalink
Removed magical option array
Browse files Browse the repository at this point in the history
  • Loading branch information
paquettg committed Jul 15, 2020
1 parent e37e8ef commit b58c6da
Show file tree
Hide file tree
Showing 16 changed files with 273 additions and 386 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Cleaned up the selector logic.
- Fixed issue with greedy regex for charset detection.
- Fixed bug causing infinite loops in some cases.
- Refactored the way we handle options. Removed the magical option array.

### Removed
- Curl interface and curl implementation have been removed.
Expand Down
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
"paquettg/string-encode": "~1.0.0",
"php-http/httplug": "^2.1",
"php-http/guzzle6-adapter": "^2.0",
"guzzlehttp/psr7": "^1.6"
"guzzlehttp/psr7": "^1.6",
"myclabs/php-enum": "^1.7"
},
"require-dev": {
"phpunit/phpunit": "^7.5.1",
Expand Down
22 changes: 9 additions & 13 deletions src/PHPHtmlParser/Content.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

namespace PHPHtmlParser;

use PHPHtmlParser\Enum\StringToken;
use PHPHtmlParser\Exceptions\ContentLengthException;
use PHPHtmlParser\Exceptions\LogicalException;

Expand Down Expand Up @@ -75,11 +76,12 @@ public function char(?int $char = null): string
* Moves the current position forward.
*
* @chainable
*
* @throws ContentLengthException
*/
public function fastForward(int $count): Content
{
if (!$this->canFastForward()) {
if (!$this->canFastForward($count)) {
// trying to go over the content length, throw exception
throw new ContentLengthException('Attempt to fastForward pass the length of the content.');
}
Expand All @@ -91,9 +93,9 @@ public function fastForward(int $count): Content
/**
* Checks if we can move the position forward.
*/
public function canFastForward(): bool
public function canFastForward(int $count): bool
{
return \strlen($this->content) > $this->pos;
return \strlen($this->content) >= $this->pos + $count;
}

/**
Expand Down Expand Up @@ -175,8 +177,6 @@ public function copyUntil(string $string, bool $char = false, bool $escape = fal
/**
* Copies the content until the string is found and return it
* unless the 'unless' is found in the substring.
*
* @return string
*/
public function copyUntilUnless(string $string, string $unless): string
{
Expand All @@ -197,13 +197,11 @@ public function copyUntilUnless(string $string, string $unless): string
/**
* Copies the content until it reaches the token string.
*
* @return string
*
* @uses $this->copyUntil()
*/
public function copyByToken(string $token, bool $char = false, bool $escape = false)
public function copyByToken(StringToken $stringToken, bool $char = false, bool $escape = false): string
{
$string = $this->$token;
$string = $stringToken->getValue();

return $this->copyUntil($string, $char, $escape);
}
Expand Down Expand Up @@ -236,13 +234,11 @@ public function skip(string $string, bool $copy = false): string
/**
* Skip a given token of pre-defined characters.
*
* @return Content|string
*
* @uses $this->skip()
*/
public function skipByToken(string $token, bool $copy = false)
public function skipByToken(StringToken $skipToken, bool $copy = false): string
{
$string = $this->$token;
$string = $skipToken->getValue();

return $this->skip($string, $copy);
}
Expand Down
86 changes: 40 additions & 46 deletions src/PHPHtmlParser/Dom.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
use PHPHtmlParser\Dom\Collection;
use PHPHtmlParser\Dom\HtmlNode;
use PHPHtmlParser\Dom\TextNode;
use PHPHtmlParser\Enum\StringToken;
use PHPHtmlParser\Exceptions\ChildNotFoundException;
use PHPHtmlParser\Exceptions\CircularException;
use PHPHtmlParser\Exceptions\ContentLengthException;
use PHPHtmlParser\Exceptions\CurlException;
use PHPHtmlParser\Exceptions\LogicalException;
use PHPHtmlParser\Exceptions\NotLoadedException;
use PHPHtmlParser\Exceptions\StrictException;
Expand Down Expand Up @@ -72,9 +72,9 @@ class Dom
/**
* A global Options object to be used by all load calls (null when unset).
*
* @var array
* @var ?Options
*/
private $globalOptions = [];
private $globalOptions;

/**
* A persistent option object to be used for all options in the
Expand Down Expand Up @@ -147,7 +147,7 @@ public function __get($name)
* @throws StrictException
* @throws LogicalException
*/
public function loadFromFile(string $file, array $options = []): Dom
public function loadFromFile(string $file, ?Options $options = null): Dom
{
$content = @\file_get_contents($file);
if ($content === false) {
Expand All @@ -168,7 +168,7 @@ public function loadFromFile(string $file, array $options = []): Dom
* @throws StrictException
* @throws \Psr\Http\Client\ClientExceptionInterface
*/
public function loadFromUrl(string $url, array $options = [], ?ClientInterface $client = null, ?RequestInterface $request = null): Dom
public function loadFromUrl(string $url, ?Options $options, ?ClientInterface $client = null, ?RequestInterface $request = null): Dom
{
if ($client === null) {
$client = new Client();
Expand All @@ -191,11 +191,15 @@ public function loadFromUrl(string $url, array $options = [], ?ClientInterface $
* @throws CircularException
* @throws StrictException
*/
public function loadStr(string $str, array $option = []): Dom
public function loadStr(string $str, ?Options $options = null): Dom
{
$this->options = new Options();
$this->options->setOptions($this->globalOptions)
->setOptions($option);
if ($this->globalOptions !== null) {
$this->options->setFromOptions($this->globalOptions);
}
if ($options !== null) {
$this->options->setFromOptions($options);
}

$this->rawSize = \strlen($str);
$this->raw = $str;
Expand All @@ -216,7 +220,7 @@ public function loadStr(string $str, array $option = []): Dom
*
* @chainable
*/
public function setOptions(array $options): Dom
public function setOptions(Options $options): Dom
{
$this->globalOptions = $options;

Expand All @@ -235,9 +239,7 @@ public function find(string $selector, int $nth = null)
{
$this->isLoaded();

$result = $this->root->find($selector, $nth);

return $result;
return $this->root->find($selector, $nth);
}

/**
Expand Down Expand Up @@ -463,7 +465,7 @@ private function isLoaded(): void
*/
private function clean(string $str): string
{
if ($this->options->get('cleanupInput') != true) {
if ($this->options->isCleanupInput() != true) {
// skip entire cleanup step
return $str;
}
Expand All @@ -488,7 +490,7 @@ private function clean(string $str): string

// clean out the \n\r
$replace = ' ';
if ($this->options->get('preserveLineBreaks')) {
if ($this->options->isPreserveLineBreaks()) {
$replace = '
';
}
$str = \str_replace(["\r\n", "\r", "\n"], $replace, $str);
Expand All @@ -515,7 +517,7 @@ private function clean(string $str): string
}

// strip out <script> tags
if ($this->options->get('removeScripts')) {
if ($this->options->isRemoveScripts()) {
$str = \mb_eregi_replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to remove scripts 1.');
Expand All @@ -527,7 +529,7 @@ private function clean(string $str): string
}

// strip out <style> tags
if ($this->options->get('removeStyles')) {
if ($this->options->isRemoveStyles()) {
$str = \mb_eregi_replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out style tags 1.');
Expand All @@ -538,16 +540,8 @@ private function clean(string $str): string
}
}

// strip out server side scripts
if ($this->options->get('serverSideScripts')) {
$str = \mb_eregi_replace("(<\?)(.*?)(\?>)", '', $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to strip out service side scripts.');
}
}

// strip smarty scripts
if ($this->options->get('removeSmartyScripts')) {
if ($this->options->isRemoveSmartyScripts()) {
$str = \mb_eregi_replace("(\{\w)(.*?)(\})", '', $str);
if ($str === false) {
throw new LogicalException('mb_eregi_replace returned false instead of a string. Error when attempting to remove smarty scripts.');
Expand All @@ -569,11 +563,11 @@ private function parse(): void
{
// add the root node
$this->root = new HtmlNode('root');
$this->root->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);
$this->root->setHtmlSpecialCharsDecode($this->options->isHtmlSpecialCharsDecode());
$activeNode = $this->root;
while ($activeNode !== null) {
if ($activeNode && $activeNode->tag->name() === 'script'
&& $this->options->get('cleanupInput') != true
&& $this->options->isCleanupInput() != true
) {
$str = $this->content->copyUntil('</');
} else {
Expand Down Expand Up @@ -618,12 +612,12 @@ private function parse(): void
if (!$node->getTag()->isSelfClosing()) {
$activeNode = $node;
}
} elseif ($this->options->whitespaceTextNode ||
} elseif ($this->options->isWhitespaceTextNode() ||
\trim($str) != ''
) {
// we found text we care about
$textNode = new TextNode($str, $this->options->removeDoubleSpace);
$textNode->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);
$textNode = new TextNode($str, $this->options->isRemoveDoubleSpace());
$textNode->setHtmlSpecialCharsDecode($this->options->isHtmlSpecialCharsDecode());
$activeNode->addChild($textNode);
}
}
Expand Down Expand Up @@ -656,7 +650,7 @@ private function parseTag(): array
if ($this->content->char() == '/') {
// end tag
$tag = $this->content->fastForward(1)
->copyByToken('slash', true);
->copyByToken(StringToken::SLASH(), true);
// move to end of tag
$this->content->copyUntil('>');
$this->content->fastForward(1);
Expand All @@ -675,20 +669,20 @@ private function parseTag(): array
return $return;
}

$tag = \strtolower($this->content->copyByToken('slash', true));
$tag = \strtolower($this->content->copyByToken(StringToken::SLASH(), true));
if (\trim($tag) == '') {
// no tag found, invalid < found
return $return;
}
$node = new HtmlNode($tag);
$node->setHtmlSpecialCharsDecode($this->options->htmlSpecialCharsDecode);
$node->setHtmlSpecialCharsDecode($this->options->isHtmlSpecialCharsDecode());

// attributes
while (
$this->content->char() != '>' &&
$this->content->char() != '/'
) {
$space = $this->content->skipByToken('blank', true);
$space = $this->content->skipByToken(StringToken::BLANK(), true);
if (empty($space)) {
try {
$this->content->fastForward(1);
Expand All @@ -699,28 +693,28 @@ private function parseTag(): array
continue;
}

$name = $this->content->copyByToken('equal', true);
$name = $this->content->copyByToken(StringToken::EQUAL(), true);
if ($name == '/') {
break;
}

if (empty($name)) {
$this->content->skipByToken('blank');
$this->content->skipByToken(StringToken::BLANK());
continue;
}

$this->content->skipByToken('blank');
$this->content->skipByToken(StringToken::BLANK());
if ($this->content->char() == '=') {
$this->content->fastForward(1)
->skipByToken('blank');
->skipByToken(StringToken::BLANK());
switch ($this->content->char()) {
case '"':
$this->content->fastForward(1);
$string = $this->content->copyUntil('"', true);
do {
$moreString = $this->content->copyUntilUnless('"', '=>');
$string .= $moreString;
} while (strlen($moreString) > 0 && $this->content->getPosition() < $this->size);
} while (\strlen($moreString) > 0 && $this->content->getPosition() < $this->size);
$attr['value'] = $string;
$this->content->fastForward(1);
$node->getTag()->setAttribute($name, $string);
Expand All @@ -731,18 +725,18 @@ private function parseTag(): array
do {
$moreString = $this->content->copyUntilUnless("'", '=>');
$string .= $moreString;
} while (strlen($moreString) > 0 && $this->content->getPosition() < $this->size);
} while (\strlen($moreString) > 0 && $this->content->getPosition() < $this->size);
$attr['value'] = $string;
$this->content->fastForward(1);
$node->getTag()->setAttribute($name, $string, false);
break;
default:
$node->getTag()->setAttribute($name, $this->content->copyByToken('attr', true));
$node->getTag()->setAttribute($name, $this->content->copyByToken(StringToken::ATTR(), true));
break;
}
} else {
// no value attribute
if ($this->options->strict) {
if ($this->options->isStrict()) {
// can't have this in strict html
$character = $this->content->getPosition();
throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
Expand All @@ -754,15 +748,15 @@ private function parseTag(): array
}
}

$this->content->skipByToken('blank');
$this->content->skipByToken(StringToken::BLANK());
$tag = \strtolower($tag);
if ($this->content->char() == '/') {
// self closing tag
$node->getTag()->selfClosing();
$this->content->fastForward(1);
} elseif (\in_array($tag, $this->selfClosing, true)) {
// Should be a self closing tag, check if we are strict
if ($this->options->strict) {
if ($this->options->isStrict()) {
$character = $this->content->getPosition();
throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
}
Expand All @@ -776,7 +770,7 @@ private function parseTag(): array
}
}

if ($this->content->canFastForward()) {
if ($this->content->canFastForward(1)) {
$this->content->fastForward(1);
}

Expand All @@ -798,7 +792,7 @@ private function detectCharset(): bool
$encode->from($this->defaultCharset);
$encode->to($this->defaultCharset);

$enforceEncoding = $this->options->enforceEncoding;
$enforceEncoding = $this->options->getEnforceEncoding();
if ($enforceEncoding !== null) {
// they want to enforce the given encoding
$encode->from($enforceEncoding);
Expand Down
Loading

0 comments on commit b58c6da

Please sign in to comment.