From b00971b2c391f3ebcd279f3c8213f7303dc16091 Mon Sep 17 00:00:00 2001
From: logmanoriginal
Date: Tue, 11 Dec 2018 17:11:07 +0100
Subject: [PATCH] [simplehtmldom] Update parser to version 1.7
- Update parser to version 1.7
https://sourceforge.net/projects/simplehtmldom/files/simplehtmldom/1.7/
References #959
-------------------- CHANGELOG --------------------
- Added code documentation to improve readability
- Added unit tests for `simple_html_dom::$self_closing_tags`
- Added unit tests for `simple_html_dom::$optional_closing_tags`
- Added unit tests for bug reports
- Added test for bug [#56](https://sourceforge.net/p/simplehtmldom/bugs/56/)
- Added test for bug [#97](https://sourceforge.net/p/simplehtmldom/bugs/97/)
- Added test for bug [#116](https://sourceforge.net/p/simplehtmldom/bugs/116/)
- Added test for bug [#121](https://sourceforge.net/p/simplehtmldom/bugs/121/)
- Added test for bug [#127](https://sourceforge.net/p/simplehtmldom/bugs/127/)
- Added test for bug [#154](https://sourceforge.net/p/simplehtmldom/bugs/154/)
- Added test for bug [#160](https://sourceforge.net/p/simplehtmldom/bugs/160/)
- Added unit tests for memory management of the parser
- Added bit flags to `simple_html_dom::load()`
- Added bit flag `HDOM_SMARTY_AS_TEXT` to optionally filter Smarty scripts (#154)\
**Note**: Smarty scripts are no longer filtered by default!\
- Added build script to automate releases
- Added support for attributes without whitespace to separate them
- Improved documentation and readability for `$self_closing_tags`
- Improved documentation and readability for `$block_tags`
- Improved documentation and readability for `$optional_closing_tags`
- Updated list of `simple_html_dom::$self_closing_tags`
- Removed 'spacer' (obsolete)
- Added 'area'
- Added 'col'
- Added 'meta'
- Added 'param'
- Added 'source'
- Added 'track'
- Added 'wbr'
- Updated list of `simple_html_dom::$optional_closing_tags`
- Removed "nobr" (obsolete)
- Added 'th' as closable element to 'td'
- Added 'td' as closable element to 'th'
- Added 'optgroup' with 'optgroup' and 'option' as closable elements
- Added 'optgroup' as closable element to 'option'
- Added 'rp' with 'rp' and 'rt' as closable elements
- Added 'rt' with 'rt' and 'rp' as closable elements
- Clarified meaning of `simple_html_dom->parent`
- Changed default `$offset` for `file_get_html()` from -1 to 0 (#161)
- Changed `simple_html_dom::load()` to remove script tags before replacing newline characters
- `simple_html_dom_node::text()` no longer adds whitespace to top level span elements (only to sub-elements)
- `simple_html_dom_node::text()` adds blank lines between paragraphs
- Normalized line endings in the repository to LF via `.gitattributes`
- Improved performance of `simple_html_dom::parse_charset()` by approximately 25%
- Improved performance of `simple_html_dom::parse()` by approximately 10%
- `str_get_html()` is deprecated and should be replaced by `new simple_html_dom()`
- Removed protected function `simple_html_dom::copy_until_char_escaped()`
- Fixed compatibility issues with PHP 7.3
- Fixed typo (#147)
- Fixed handling of incorrectly escaped text (#160)
- Restore functionality of `$maxLen` in `file_get_html()`
- Fixed load_file breaks if an error occurred in another script
---
vendor/simplehtmldom/simple_html_dom.php | 752 ++++++++++++++++++-----
1 file changed, 591 insertions(+), 161 deletions(-)
diff --git a/vendor/simplehtmldom/simple_html_dom.php b/vendor/simplehtmldom/simple_html_dom.php
index b5d308987ba..676807d3856 100644
--- a/vendor/simplehtmldom/simple_html_dom.php
+++ b/vendor/simplehtmldom/simple_html_dom.php
@@ -34,7 +34,7 @@
* @author S.C. Chen
* @author John Schlick
* @author Rus Carroll
- * @version 1.5 ($Rev: 208 $)
+ * @version Rev. 1.7 (214)
* @package PlaceLocalInclude
* @subpackage simple_html_dom
*/
@@ -63,20 +63,27 @@
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
-define('MAX_FILE_SIZE', 10000000);
+define('MAX_FILE_SIZE', 600000);
+
+/** Contents between curly braces "{" and "}" are interpreted as text */
+define('HDOM_SMARTY_AS_TEXT', 1);
+
// helper functions
// -----------------------------------------------------------------------------
// get html dom from file
// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
-function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+function file_get_html($url, $use_include_path = false, $context=null, $offset = 0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
+ // Ensure maximum length is greater than zero
+ if($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }
+
// We DO force the tags to be terminated.
$dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
- // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done.
- $contents = file_get_contents($url, $use_include_path, $context, $offset);
+ // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
+ $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
// Paperg - use our own mechanism for getting the contents as we want to control the timeout.
//$contents = retrieve_url_contents($url);
- if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
+ if (empty($contents) || strlen($contents) > $maxLen)
{
return false;
}
@@ -114,17 +121,68 @@ function dump_html_tree($node, $show_attr=true, $deep=0)
*/
class simple_html_dom_node
{
+ /**
+ * Node type
+ *
+ * Default is {@see HDOM_TYPE_TEXT}
+ *
+ * @var int
+ */
public $nodetype = HDOM_TYPE_TEXT;
+
+ /**
+ * Tag name
+ *
+ * Default is 'text'
+ *
+ * @var string
+ */
public $tag = 'text';
+
+ /**
+ * List of attributes
+ *
+ * @var array
+ */
public $attr = array();
+
+ /**
+ * List of child node objects
+ *
+ * @var array
+ */
public $children = array();
public $nodes = array();
+
+ /**
+ * The parent node object
+ *
+ * @var object|null
+ */
public $parent = null;
+
// The "info" array - see HDOM_INFO_... for what each element contains.
public $_ = array();
+
+ /**
+ * Start position of the tag in the document
+ *
+ * @var int
+ */
public $tag_start = 0;
+
+ /**
+ * The DOM object
+ *
+ * @var object|null
+ */
private $dom = null;
+ /**
+ * Construct new node object
+ *
+ * Adds itself to the list of DOM Nodes {@see simple_html_dom::$nodes}
+ */
function __construct($dom)
{
$this->dom = $dom;
@@ -240,8 +298,13 @@ function dump_node($echo=true)
}
}
- // returns the parent of node
- // If a node is passed in, it will reset the parent of the current node to that one.
+ /**
+ * Return or set parent node
+ *
+ * @param object|null $parent (optional) The parent node, `null` to return
+ * the current parent node.
+ * @return object|null The parent node
+ */
function parent($parent=null)
{
// I am SURE that this doesn't work properly.
@@ -256,13 +319,22 @@ function parent($parent=null)
return $this->parent;
}
- // verify that node has children
+ /**
+ * @return bool True if the node has at least one child node
+ */
function has_child()
{
return !empty($this->children);
}
- // returns children of node
+ /**
+ * Get child node at specified index
+ *
+ * @param int $idx The index of the child node to return, `-1` to return all
+ * child nodes.
+ * @return object|array|null The child node at the specified index, all child
+ * nodes or null if the index is invalid.
+ */
function children($idx=-1)
{
if ($idx===-1)
@@ -276,7 +348,15 @@ function children($idx=-1)
return null;
}
- // returns the first child of node
+ /**
+ * Get first child node
+ *
+ * @return object|null The first child node or null if the current node has
+ * no child nodes.
+ *
+ * @todo Use `empty()` instead of `count()` to improve performance on large
+ * arrays.
+ */
function first_child()
{
if (count($this->children)>0)
@@ -286,7 +366,14 @@ function first_child()
return null;
}
- // returns the last child of node
+ /**
+ * Get last child node
+ *
+ * @return object|null The last child node or null if the current node has
+ * no child nodes.
+ *
+ * @todo Use `end()` to slightly improve performance on large arrays.
+ */
function last_child()
{
if (($count=count($this->children))>0)
@@ -296,7 +383,12 @@ function last_child()
return null;
}
- // returns the next sibling of node
+ /**
+ * Get next sibling node
+ *
+ * @return object|null The sibling node or null if the current node has no
+ * sibling nodes.
+ */
function next_sibling()
{
if ($this->parent===null)
@@ -317,7 +409,12 @@ function next_sibling()
return $this->parent->children[$idx];
}
- // returns the previous sibling of node
+ /**
+ * Get previous sibling node
+ *
+ * @return object|null The sibling node or null if the current node has no
+ * sibling nodes.
+ */
function prev_sibling()
{
if ($this->parent===null) return null;
@@ -329,7 +426,16 @@ function prev_sibling()
return $this->parent->children[$idx];
}
- // function to locate a specific ancestor tag in the path to the root.
+ /**
+ * Traverse ancestors to the first matching tag.
+ *
+ * @param string $tag Tag to find
+ * @return object|null First matching node in the DOM tree or null if no
+ * match was found.
+ *
+ * @todo Null is returned implicitly by calling ->parent on the root node.
+ * This behaviour could change at any time, rendering this function invalid.
+ */
function find_ancestor_tag($tag)
{
global $debug_object;
@@ -351,7 +457,11 @@ function find_ancestor_tag($tag)
return $returnDom;
}
- // get dom node's inner html
+ /**
+ * Get node's inner text (everything inside the opening and closing tags)
+ *
+ * @return string
+ */
function innertext()
{
if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
@@ -363,7 +473,11 @@ function innertext()
return $ret;
}
- // get dom node's outer text (with tag)
+ /**
+ * Get node's outer text (everything including the opening and closing tags)
+ *
+ * @return string
+ */
function outertext()
{
global $debug_object;
@@ -423,7 +537,11 @@ function outertext()
return $ret;
}
- // get dom node's plain text
+ /**
+ * Get node's plain text (everything excluding all tags)
+ *
+ * @return string
+ */
function text()
{
if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
@@ -444,20 +562,29 @@ function text()
{
foreach ($this->nodes as $n)
{
+ // Start paragraph after a blank line
+ if ($n->tag == 'p')
+ {
+ $ret .= "\n\n";
+ }
+
$ret .= $this->convert_text($n->text());
- }
- // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
- if ($this->tag == "span")
- {
- $ret .= $this->dom->default_span_text;
+ // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
+ if ($n->tag == "span")
+ {
+ $ret .= $this->dom->default_span_text;
+ }
}
-
-
}
- return $ret;
+ return trim($ret);
}
+ /**
+ * Get node's xml text (inner text as a CDATA section)
+ *
+ * @return string
+ */
function xmltext()
{
$ret = $this->innertext();
@@ -686,7 +813,7 @@ protected function parse_selector($selector_string) {
// This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
// farther study is required to determine of this should be documented or removed.
// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
- $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
+ $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}
@@ -986,49 +1113,262 @@ function appendChild($node) {$node->parent($this); return $node;}
*/
class simple_html_dom
{
+ /**
+ * The root node of the document
+ *
+ * @var object
+ */
public $root = null;
+
+ /**
+ * List of nodes in the current DOM
+ *
+ * @var array
+ */
public $nodes = array();
+
+ /**
+ * Callback function to run for each element in the DOM.
+ *
+ * @var callable|null
+ */
public $callback = null;
+
+ /**
+ * Indicates how tags and attributes are matched
+ *
+ * @var bool When set to **true** tags and attributes will be converted to
+ * lowercase before matching.
+ */
public $lowercase = false;
- // Used to keep track of how large the text was when we started.
+
+ /**
+ * Original document size
+ *
+ * Holds the original document size.
+ *
+ * @var int
+ */
public $original_size;
+
+ /**
+ * Current document size
+ *
+ * Holds the current document size. The document size is determined by the
+ * string length of ({@see simple_html_dom::$doc}).
+ *
+ * _Note_: Using this variable is more efficient than calling `strlen($doc)`
+ *
+ * @var int
+ * */
public $size;
+
+ /**
+ * Current position in the document
+ *
+ * @var int
+ */
protected $pos;
+
+ /**
+ * The document
+ *
+ * @var string
+ */
protected $doc;
+
+ /**
+ * Current character
+ *
+ * Holds the current character at position {@see simple_html_dom::$pos} in
+ * the document {@see simple_html_dom::$doc}
+ *
+ * _Note_: Using this variable is more efficient than calling `substr($doc, $pos, 1)`
+ *
+ * @var string
+ */
protected $char;
+
protected $cursor;
+
+ /**
+ * Parent node of the next node detected by the parser
+ *
+ * @var object
+ */
protected $parent;
protected $noise = array();
+
+ /**
+ * Tokens considered blank in HTML
+ *
+ * @var string
+ */
protected $token_blank = " \t\r\n";
+
+ /**
+ * Tokens to identify the equal sign for attributes, stopping either at the
+ * closing tag ("/" i.e. "<html />") or the end of an opening tag (">" i.e.
+ * "<html>")
+ *
+ * @var string
+ */
protected $token_equal = ' =/>';
+
+ /**
+ * Tokens to identify the end of a tag name. A tag name either ends on the
+ * ending slash ("/" i.e. "<html/>") or whitespace ("\s\r\n\t")
+ *
+ * @var string
+ */
protected $token_slash = " />\r\n\t";
+
+ /**
+ * Tokens to identify the end of an attribute
+ *
+ * @var string
+ */
protected $token_attr = ' >';
+
// Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
public $_charset = '';
public $_target_charset = '';
+
+ /**
+ * Innertext for <br> elements
+ *
+ * @var string
+ */
protected $default_br_text = "";
+
+ /**
+ * Suffix for <span> elements
+ *
+ * @var string
+ */
public $default_span_text = "";
- // use isset instead of in_array, performance boost about 30%...
- protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
- protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
- // Known sourceforge issue #2977341
- // B tags that are not closed cause us to return everything to the end of the document.
+ /**
+ * Defines a list of self-closing tags (Void elements) according to the HTML
+ * Specification
+ *
+ * _Remarks_:
+ * - Use `isset()` instead of `in_array()` on array elements to boost
+ * performance about 30%
+ * - Sort elements by name for better readability!
+ *
+ * @link https://www.w3.org/TR/html HTML Specification
+ * @link https://www.w3.org/TR/html/syntax.html#void-elements Void elements
+ */
+ protected $self_closing_tags = array(
+ 'area'=>1,
+ 'base'=>1,
+ 'br'=>1,
+ 'col'=>1,
+ 'embed'=>1,
+ 'hr'=>1,
+ 'img'=>1,
+ 'input'=>1,
+ 'link'=>1,
+ 'meta'=>1,
+ 'param'=>1,
+ 'source'=>1,
+ 'track'=>1,
+ 'wbr'=>1
+ );
+
+ /**
+ * Defines a list of tags which - if closed - close all optional closing
+ * elements within if they haven't been closed yet. (So, an element where
+ * neither opening nor closing tag is omissible consistently closes every
+ * optional closing element within)
+ *
+ * _Remarks_:
+ * - Use `isset()` instead of `in_array()` on array elements to boost
+ * performance about 30%
+ * - Sort elements by name for better readability!
+ */
+ protected $block_tags = array(
+ 'body'=>1,
+ 'div'=>1,
+ 'form'=>1,
+ 'root'=>1,
+ 'span'=>1,
+ 'table'=>1
+ );
+
+ /**
+ * Defines elements whose end tag is omissible.
+ *
+ * * key = Name of an element whose end tag is omissible.
+ * * value = Names of elements whose end tag is omissible, that are closed
+ * by the current element.
+ *
+ * _Remarks_:
+ * - Use `isset()` instead of `in_array()` on array elements to boost
+ * performance about 30%
+ * - Sort elements by name for better readability!
+ *
+ * **Example**
+ *
+ * An `li` element’s end tag may be omitted if the `li` element is immediately
+ * followed by another `li` element. To do that, add following element to the
+ * array:
+ *
+ * ```php
+ * 'li' => array('li'),
+ * ```
+ *
+ * With this, the following two examples are considered equal. Note that the
+ * second example is missing the closing tags on `li` elements.
+ *
+ * ```html
+ * <ul><li>Coffee</li><li>Tea</li><li>Milk</li></ul>
+ * ```
+ *
+ *
+ *
+ * ```html
+ * <ul><li>Coffee<li>Tea<li>Milk</ul>
+ * ```
+ *
+ *
+ *
+ * @var array A two-dimensional array where the key is the name of an
+ * element whose end tag is omissible and the value is an array of elements
+ * whose end tag is omissible, that are closed by the current element.
+ *
+ * @link https://www.w3.org/TR/html/syntax.html#optional-tags Optional tags
+ *
+ * @todo The implementation of optional closing tags doesn't work in all cases
+ * because it only considers elements that close other optional closing
+ * tags, not taking into account that some (non-blocking) tags should close
+ * these optional closing tags. For example, the end tag for "p" is omissible
+ * and can be closed by an "address" element, whose end tag is NOT omissible.
+ * Currently a "p" element without closing tag stops at the next "p" element
+ * or blocking tag, even if it contains other elements.
+ *
+ * @todo Known sourceforge issue #2977341
+ * B tags that are not closed cause us to return everything to the end of
+ * the document.
+ */
protected $optional_closing_tags = array(
- 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
- 'th'=>array('th'=>1),
- 'td'=>array('td'=>1),
- 'li'=>array('li'=>1),
- 'dt'=>array('dt'=>1, 'dd'=>1),
+ 'b'=>array('b'=>1), // Not optional, see https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
'dd'=>array('dd'=>1, 'dt'=>1),
- 'dl'=>array('dd'=>1, 'dt'=>1),
+ 'dl'=>array('dd'=>1, 'dt'=>1), // Not optional, see https://www.w3.org/TR/html/grouping-content.html#the-dl-element
+ 'dt'=>array('dd'=>1, 'dt'=>1),
+ 'li'=>array('li'=>1),
+ 'optgroup'=>array('optgroup'=>1, 'option'=>1),
+ 'option'=>array('optgroup'=>1, 'option'=>1),
'p'=>array('p'=>1),
- 'nobr'=>array('nobr'=>1),
- 'b'=>array('b'=>1),
- 'option'=>array('option'=>1),
+ 'rp'=>array('rp'=>1, 'rt'=>1),
+ 'rt'=>array('rp'=>1, 'rt'=>1),
+ 'td'=>array('td'=>1, 'th'=>1),
+ 'th'=>array('td'=>1, 'th'=>1),
+ 'tr'=>array('td'=>1, 'th'=>1, 'tr'=>1),
);
- function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+ function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
{
if ($str)
{
@@ -1038,7 +1378,7 @@ function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_
}
else
{
- $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
+ $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText, $options);
}
}
// Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
@@ -1054,21 +1394,32 @@ function __destruct()
}
// load html from string
- function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+ function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT, $options=0)
{
global $debug_object;
// prepare
- $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
- // strip out cdata
- $this->remove_noise("''is", true);
- // strip out comments
- $this->remove_noise("''is");
+ $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
+
// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
// Script tags removal now preceeds style tag removal.
// strip out