You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
337 lines
9.2 KiB
337 lines
9.2 KiB
2 years ago
|
<?php
|
||
|
/**
|
||
|
* Copyright 2010-2017 Horde LLC (http://www.horde.org/)
|
||
|
*
|
||
|
* See the enclosed file COPYING for license information (LGPL). If you
|
||
|
* did not receive this file, see http://www.horde.org/licenses/lgpl21.
|
||
|
*
|
||
|
* @category Horde
|
||
|
* @copyright 2010-2017 Horde LLC
|
||
|
* @package Util
|
||
|
* @license http://www.horde.org/licenses/lgpl21 LGPL 2.1
|
||
|
*/
|
||
|
|
||
|
/**
|
||
|
* Parse DOM data from HTML strings.
|
||
|
*
|
||
|
* @author Michael Slusarz <slusarz@horde.org>
|
||
|
* @category Horde
|
||
|
* @copyright 2010-2017 Horde LLC
|
||
|
* @package Util
|
||
|
* @license http://www.horde.org/licenses/lgpl21 LGPL 2.1
|
||
|
*/
|
||
|
class Horde_Domhtml implements Iterator
|
||
|
{
|
||
|
/**
|
||
|
* DOM object.
|
||
|
*
|
||
|
* @var DOMDocument
|
||
|
*/
|
||
|
public $dom;
|
||
|
|
||
|
/**
|
||
|
* Iterator status.
|
||
|
*
|
||
|
* @var array
|
||
|
*/
|
||
|
protected $_iterator = null;
|
||
|
|
||
|
/**
|
||
|
* Original charset of data.
|
||
|
*
|
||
|
* @var string
|
||
|
*/
|
||
|
protected $_origCharset;
|
||
|
|
||
|
/**
|
||
|
* Encoding tag added to beginning of output.
|
||
|
*
|
||
|
* @var string
|
||
|
*/
|
||
|
protected $_xmlencoding = '';
|
||
|
|
||
|
/**
|
||
|
* Constructor.
|
||
|
*
|
||
|
* @param string $text The text of the HTML document.
|
||
|
* @param string $charset The charset of the HTML document.
|
||
|
*
|
||
|
* @throws Exception
|
||
|
*/
|
||
|
public function __construct($text, $charset = null)
|
||
|
{
|
||
|
if (!extension_loaded('dom')) {
|
||
|
throw new Exception('DOM extension is not available.');
|
||
|
}
|
||
|
|
||
|
// Bug #9616: Make sure we have valid HTML input.
|
||
|
if (!strlen($text)) {
|
||
|
$text = '<html></html>';
|
||
|
}
|
||
|
|
||
|
$old_error = libxml_use_internal_errors(true);
|
||
|
$this->dom = new DOMDocument();
|
||
|
|
||
|
if (is_null($charset)) {
|
||
|
/* If no charset given, charset is whatever libxml tells us the
|
||
|
* encoding should be defaulting to 'iso-8859-1'. */
|
||
|
$this->_loadHTML($text);
|
||
|
$this->_origCharset = $this->dom->encoding
|
||
|
? $this->dom->encoding
|
||
|
: 'iso-8859-1';
|
||
|
} else {
|
||
|
/* Convert/try with UTF-8 first. */
|
||
|
$this->_origCharset = Horde_String::lower($charset);
|
||
|
$this->_xmlencoding = '<?xml encoding="UTF-8"?>';
|
||
|
$this->_loadHTML(
|
||
|
$this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8')
|
||
|
);
|
||
|
|
||
|
if ($this->dom->encoding &&
|
||
|
(Horde_String::lower($this->dom->encoding) != 'utf-8')) {
|
||
|
/* Convert charset to what the HTML document says it SHOULD
|
||
|
* be. */
|
||
|
$this->_loadHTML(
|
||
|
Horde_String::convertCharset($text, $charset, $this->dom->encoding)
|
||
|
);
|
||
|
$this->_xmlencoding = '';
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ($old_error) {
|
||
|
libxml_use_internal_errors(false);
|
||
|
}
|
||
|
|
||
|
/* Sanity checking: make sure we have the documentElement object. */
|
||
|
if (!$this->dom->documentElement) {
|
||
|
$this->dom->appendChild($this->dom->createElement('html'));
|
||
|
}
|
||
|
|
||
|
/* Remove old charset information. */
|
||
|
$xpath = new DOMXPath($this->dom);
|
||
|
$domlist = $xpath->query('/html/head/meta[@http-equiv="content-type"]');
|
||
|
for ($i = $domlist->length; $i > 0; --$i) {
|
||
|
$meta = $domlist->item($i - 1);
|
||
|
$meta->parentNode->removeChild($meta);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the HEAD element, or creates one if it doesn't exist.
|
||
|
*
|
||
|
* @return DOMElement HEAD element.
|
||
|
*/
|
||
|
public function getHead()
|
||
|
{
|
||
|
$head = $this->dom->getElementsByTagName('head');
|
||
|
if ($head->length) {
|
||
|
return $head->item(0);
|
||
|
}
|
||
|
|
||
|
$headelt = $this->dom->createElement('head');
|
||
|
$this->dom->documentElement->insertBefore($headelt, $this->dom->documentElement->firstChild);
|
||
|
|
||
|
return $headelt;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the BODY element, or creates one if it doesn't exist.
|
||
|
*
|
||
|
* @since 2.2.0
|
||
|
*
|
||
|
* @return DOMElement BODY element.
|
||
|
*/
|
||
|
public function getBody()
|
||
|
{
|
||
|
$body = $this->dom->getElementsByTagName('body');
|
||
|
if ($body->length) {
|
||
|
return $body->item(0);
|
||
|
}
|
||
|
|
||
|
$bodyelt = $this->dom->createElement('body');
|
||
|
$this->dom->documentElement->appendChild($bodyelt);
|
||
|
|
||
|
return $bodyelt;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the full HTML text in the original charset.
|
||
|
*
|
||
|
* @param array $opts Additional options: (since 2.1.0)
|
||
|
* - charset: (string) Return using this charset. If set but empty, will
|
||
|
* return as currently stored in the DOM object.
|
||
|
* - metacharset: (boolean) If true, will add a META tag containing the
|
||
|
* charset information.
|
||
|
*
|
||
|
* @return string HTML text.
|
||
|
*/
|
||
|
public function returnHtml(array $opts = array())
|
||
|
{
|
||
|
$curr_charset = $this->getCharset();
|
||
|
if (strcasecmp($curr_charset, 'US-ASCII') === 0) {
|
||
|
$curr_charset = 'UTF-8';
|
||
|
}
|
||
|
$charset = array_key_exists('charset', $opts)
|
||
|
? (empty($opts['charset']) ? $curr_charset : $opts['charset'])
|
||
|
: $this->_origCharset;
|
||
|
|
||
|
if (empty($opts['metacharset'])) {
|
||
|
$text = $this->dom->saveHTML();
|
||
|
} else {
|
||
|
/* Add placeholder for META tag. Can't add charset yet because DOM
|
||
|
* extension will alter output if it exists. */
|
||
|
$meta = $this->dom->createElement('meta');
|
||
|
$meta->setAttribute('http-equiv', 'content-type');
|
||
|
$meta->setAttribute('horde_dom_html_charset', '');
|
||
|
|
||
|
$head = $this->getHead();
|
||
|
$head->insertBefore($meta, $head->firstChild);
|
||
|
|
||
|
$text = str_replace(
|
||
|
'horde_dom_html_charset=""',
|
||
|
'content="text/html; charset=' . $charset . '"',
|
||
|
$this->dom->saveHTML()
|
||
|
);
|
||
|
|
||
|
$head->removeChild($meta);
|
||
|
}
|
||
|
|
||
|
if (strcasecmp($curr_charset, $charset) !== 0) {
|
||
|
$text = Horde_String::convertCharset($text, $curr_charset, $charset);
|
||
|
}
|
||
|
|
||
|
if (!$this->_xmlencoding ||
|
||
|
(($pos = strpos($text, $this->_xmlencoding)) === false)) {
|
||
|
return $text;
|
||
|
}
|
||
|
|
||
|
return substr_replace($text, '', $pos, strlen($this->_xmlencoding));
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Returns the body text in the original charset.
|
||
|
*
|
||
|
* @return string HTML text.
|
||
|
*/
|
||
|
public function returnBody()
|
||
|
{
|
||
|
$body = $this->getBody();
|
||
|
$text = '';
|
||
|
|
||
|
if ($body->hasChildNodes()) {
|
||
|
foreach ($body->childNodes as $child) {
|
||
|
$text .= $this->dom->saveXML($child);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return Horde_String::convertCharset($text, 'UTF-8', $this->_origCharset);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Get the charset of the DOM data.
|
||
|
*
|
||
|
* @since 2.1.0
|
||
|
*
|
||
|
* @return string Charset of DOM data.
|
||
|
*/
|
||
|
public function getCharset()
|
||
|
{
|
||
|
return $this->dom->encoding
|
||
|
? $this->dom->encoding
|
||
|
: ($this->_xmlencoding ? 'UTF-8' : $this->_origCharset);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Loads the HTML data.
|
||
|
*
|
||
|
* @param string $html HTML data.
|
||
|
*/
|
||
|
protected function _loadHTML($html)
|
||
|
{
|
||
|
if (version_compare(PHP_VERSION, '5.4', '>=')) {
|
||
|
$mask = defined('LIBXML_PARSEHUGE')
|
||
|
? LIBXML_PARSEHUGE
|
||
|
: 0;
|
||
|
$mask |= defined('LIBXML_COMPACT')
|
||
|
? LIBXML_COMPACT
|
||
|
: 0;
|
||
|
$this->dom->loadHTML($html, $mask);
|
||
|
} else {
|
||
|
$this->dom->loadHTML($html);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Iterator methods. */
|
||
|
|
||
|
/**
|
||
|
*/
|
||
|
public function current()
|
||
|
{
|
||
|
if ($this->_iterator instanceof DOMDocument) {
|
||
|
return $this->_iterator;
|
||
|
}
|
||
|
|
||
|
$curr = end($this->_iterator);
|
||
|
return $curr['list']->item($curr['i']);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*/
|
||
|
public function key()
|
||
|
{
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*/
|
||
|
public function next()
|
||
|
{
|
||
|
/* Iterate in the reverse direction through the node list. This allows
|
||
|
* alteration of the original list without breaking things (foreach()
|
||
|
* w/removeChild() may exit iteration after removal is complete. */
|
||
|
|
||
|
if ($this->_iterator instanceof DOMDocument) {
|
||
|
$this->_iterator = array();
|
||
|
$curr = array();
|
||
|
$node = $this->dom;
|
||
|
} elseif (empty($this->_iterator)) {
|
||
|
$this->_iterator = null;
|
||
|
return;
|
||
|
} else {
|
||
|
$curr = &$this->_iterator[count($this->_iterator) - 1];
|
||
|
$node = $curr['list']->item($curr['i']);
|
||
|
}
|
||
|
|
||
|
if (empty($curr['child']) &&
|
||
|
($node instanceof DOMNode) &&
|
||
|
$node->hasChildNodes()) {
|
||
|
$curr['child'] = true;
|
||
|
$this->_iterator[] = array(
|
||
|
'child' => false,
|
||
|
'i' => $node->childNodes->length - 1,
|
||
|
'list' => $node->childNodes
|
||
|
);
|
||
|
} elseif (--$curr['i'] < 0) {
|
||
|
array_pop($this->_iterator);
|
||
|
$this->next();
|
||
|
} else {
|
||
|
$curr['child'] = false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*/
|
||
|
public function rewind()
|
||
|
{
|
||
|
$this->_iterator = $this->dom;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*/
|
||
|
public function valid()
|
||
|
{
|
||
|
return !is_null($this->_iterator);
|
||
|
}
|
||
|
|
||
|
}
|