You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

2356 lines
58 KiB

  1. <?php
  2. /**
  3. * Website: http://sourceforge.net/projects/simplehtmldom/
  4. * Additional projects: http://sourceforge.net/projects/debugobject/
  5. * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
  6. *
  7. * Licensed under The MIT License
  8. * See the LICENSE file in the project root for more information.
  9. *
  10. * Authors:
  11. * S.C. Chen
  12. * John Schlick
  13. * Rus Carroll
  14. * logmanoriginal
  15. *
  16. * Contributors:
  17. * Yousuke Kumakura
  18. * Vadim Voituk
  19. * Antcs
  20. *
  21. * Version Rev. 1.9 (290)
  22. */
  // Remove PHP's execution time limit. set_time_limit() is only called when
  // it is not listed in the "disable_functions" ini setting; the ini_set()
  // call below requests the same thing through the ini mechanism.
  if (strpos(@ini_get('disable_functions'), 'set_time_limit') === false) {
      @set_time_limit(0);
  }
  ini_set('max_execution_time', 0);

  // Node types (default value of simple_html_dom_node::$nodetype is
  // HDOM_TYPE_TEXT).
  define('HDOM_TYPE_ELEMENT', 1);
  define('HDOM_TYPE_COMMENT', 2);
  define('HDOM_TYPE_TEXT', 3);
  define('HDOM_TYPE_ENDTAG', 4);
  define('HDOM_TYPE_ROOT', 5);
  define('HDOM_TYPE_UNKNOWN', 6);

  // Attribute quoting styles, consulted by makeup() when re-serializing
  // attributes. Note that the "no quote" value is 3, not 2.
  define('HDOM_QUOTE_DOUBLE', 0);
  define('HDOM_QUOTE_SINGLE', 1);
  define('HDOM_QUOTE_NO', 3);

  // Keys of the per-node bookkeeping array simple_html_dom_node::$_.
  define('HDOM_INFO_BEGIN', 0);    // index of the node in the document's node list
  define('HDOM_INFO_END', 1);      // end marker; 0/unset means no closing tag is emitted
  define('HDOM_INFO_QUOTE', 2);    // per-attribute quote style (HDOM_QUOTE_*)
  define('HDOM_INFO_SPACE', 3);    // per-attribute whitespace triples
  define('HDOM_INFO_TEXT', 4);     // raw text of text-like nodes
  define('HDOM_INFO_INNER', 5);    // overridden inner text
  define('HDOM_INFO_OUTER', 6);    // overridden outer text
  define('HDOM_INFO_ENDSPACE', 7); // text emitted right before the closing '>' of a tag

  // Defaults that the including script may override by defining the constant
  // before this file is loaded.
  defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
  defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
  // Upper bound (in bytes) on documents accepted by file_get_html() and
  // str_get_html().
  defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);

  // NOTE(review): not referenced in this part of the file; presumably a parser
  // option flag - confirm against the simple_html_dom class.
  define('HDOM_SMARTY_AS_TEXT', 1);
  /**
   * Read a document from a file or URL and parse it into a DOM.
   *
   * $url, $use_include_path, $context and $offset are handed to
   * file_get_contents() unchanged. The remaining options are forwarded to the
   * simple_html_dom constructor and to its load() method.
   *
   * @param string   $url              Path or URL to read.
   * @param bool     $use_include_path See file_get_contents().
   * @param resource $context          Stream context, or null.
   * @param int      $offset           Offset at which reading starts.
   * @param int      $maxLen           Maximum accepted size in bytes; values
   *                                   <= 0 select MAX_FILE_SIZE.
   * @param bool     $lowercase        Forwarded to the parser.
   * @param bool     $forceTagsClosed  Forwarded to the parser.
   * @param string   $target_charset   Forwarded to the parser.
   * @param bool     $stripRN          Forwarded to the parser.
   * @param string   $defaultBRText    Forwarded to the parser.
   * @param string   $defaultSpanText  Forwarded to the parser.
   *
   * @return mixed The result of simple_html_dom::load(), or false when the
   *               document could not be read, is empty, or exceeds $maxLen.
   */
  function file_get_html(
      $url,
      $use_include_path = false,
      $context = null,
      $offset = 0,
      $maxLen = -1,
      $lowercase = true,
      $forceTagsClosed = true,
      $target_charset = DEFAULT_TARGET_CHARSET,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT)
  {
      if ($maxLen <= 0) { $maxLen = MAX_FILE_SIZE; }

      $dom = new simple_html_dom(
          null,
          $lowercase,
          $forceTagsClosed,
          $target_charset,
          $stripRN,
          $defaultBRText,
          $defaultSpanText
      );

      // Read one byte more than the limit. Reading exactly $maxLen bytes would
      // silently truncate oversized documents and make the size check below
      // impossible to trigger.
      $contents = file_get_contents(
          $url,
          $use_include_path,
          $context,
          $offset,
          $maxLen + 1
      );

      // file_get_contents() yields false on failure; empty() covers that case
      // as well as an empty document.
      if (empty($contents) || strlen($contents) > $maxLen) {
          $dom->clear();
          return false;
      }

      return $dom->load($contents, $lowercase, $stripRN);
  }
  /**
   * Parse an HTML string into a DOM.
   *
   * All options are forwarded to the simple_html_dom constructor; $lowercase
   * and $stripRN are additionally passed to its load() method.
   *
   * @param string $str The markup to parse.
   *
   * @return mixed The result of simple_html_dom::load(), or false when $str is
   *               empty (as judged by empty()) or longer than MAX_FILE_SIZE.
   */
  function str_get_html(
      $str,
      $lowercase = true,
      $forceTagsClosed = true,
      $target_charset = DEFAULT_TARGET_CHARSET,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT)
  {
      $document = new simple_html_dom(
          null,
          $lowercase,
          $forceTagsClosed,
          $target_charset,
          $stripRN,
          $defaultBRText,
          $defaultSpanText
      );

      $parsable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;

      if (!$parsable) {
          $document->clear();
          return false;
      }

      return $document->load($str, $lowercase, $stripRN);
  }
  /**
   * Print the tree below $node by delegating to the node's dump() method.
   *
   * @param object $node      Node to dump; must provide dump($show_attr, $depth).
   * @param bool   $show_attr Whether attributes are printed.
   * @param int    $deep      Indentation depth of the first printed line.
   *
   * @return void
   */
  function dump_html_tree($node, $show_attr = true, $deep = 0)
  {
      // Forward the caller's options instead of passing the node itself as the
      // $show_attr argument.
      $node->dump($show_attr, $deep);
  }
  118. class simple_html_dom_node
  119. {
  120. public $nodetype = HDOM_TYPE_TEXT;
  121. public $tag = 'text';
  122. public $attr = array();
  123. public $children = array();
  124. public $nodes = array();
  125. public $parent = null;
  126. public $_ = array();
  127. public $tag_start = 0;
  128. private $dom = null;
  /**
   * Create a node that belongs to $dom and append it to the document's
   * node list.
   *
   * @param object $dom Owning document; receives the node in $dom->nodes.
   */
  public function __construct($dom)
  {
      $dom->nodes[] = $this;
      $this->dom = $dom;
  }
  /**
   * Release the references held by this node when it is destroyed.
   */
  public function __destruct()
  {
      $this->clear();
  }
  /**
   * String conversion yields the node's outer text.
   *
   * @return string
   */
  public function __toString()
  {
      return $this->outertext();
  }
  /**
   * Drop the references to the owning document, the parent and all child
   * nodes by setting the corresponding properties to null.
   *
   * @return void
   */
  public function clear()
  {
      $this->children = null;
      $this->parent = null;
      $this->nodes = null;
      $this->dom = null;
  }
  /**
   * Print this node and, recursively, everything in $this->nodes.
   *
   * Each node occupies one line: $depth tab characters, the tag name and,
   * when requested and present, the attribute list.
   *
   * @param bool $show_attr Whether attributes are printed.
   * @param int  $depth     Number of leading tabs for this node.
   *
   * @return void
   */
  public function dump($show_attr = true, $depth = 0)
  {
      $line = str_repeat("\t", $depth) . $this->tag;

      if ($show_attr && count($this->attr) > 0) {
          $line .= '(';
          foreach ($this->attr as $name => $value) {
              $line .= "[$name]=>\"$value\", ";
          }
          $line .= ')';
      }

      echo $line . "\n";

      if (!$this->nodes) {
          return;
      }

      foreach ($this->nodes as $child) {
          $child->dump($show_attr, $depth + 1);
      }
  }
  /**
   * Build a one-line debug description of this node: tag, attributes, the
   * bookkeeping array $_, the overridden inner text and the child counts.
   *
   * @param bool $echo When true the description is printed and nothing is
   *                   returned; when false it is returned as a string.
   *
   * @return string|null
   */
  public function dump_node($echo = true)
  {
      $string = $this->tag;

      if (count($this->attr) > 0) {
          $string .= '(';
          foreach ($this->attr as $k => $v) {
              $string .= "[$k]=>\"$v\", ";
          }
          $string .= ')';
      }

      if (count($this->_) > 0) {
          $string .= ' $_ (';
          foreach ($this->_ as $k => $v) {
              if (is_array($v)) {
                  $string .= "[$k]=>(";
                  foreach ($v as $k2 => $v2) {
                      $string .= "[$k2]=>\"$v2\", ";
                  }
                  $string .= ')';
              } else {
                  $string .= "[$k]=>\"$v\", ";
              }
          }
          $string .= ')';
      }

      // "text" is not a declared property, so this goes through __isset()
      // and __get().
      if (isset($this->text)) {
          $string .= " text: ({$this->text})";
      }

      // Report this node's own inner info. The previous code read an
      // undefined $node variable and therefore always printed NULL.
      $string .= ' HDOM_INNER_INFO: ';
      if (isset($this->_[HDOM_INFO_INNER])) {
          $string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
      } else {
          $string .= ' NULL ';
      }

      // clear() sets both lists to null; count() must not be called on null.
      $string .= ' children: ' . (is_array($this->children) ? count($this->children) : 0);
      $string .= ' nodes: ' . (is_array($this->nodes) ? count($this->nodes) : 0);
      $string .= ' tag_start: ' . $this->tag_start;
      $string .= "\n";

      if ($echo) {
          echo $string;
          return;
      }

      return $string;
  }
  /**
   * Get the parent node, or attach this node to a new parent.
   *
   * Known limitation (kept from the original implementation): when a new
   * parent is given, this node is appended to the new parent's nodes and
   * children lists but is NOT removed from the lists of its previous parent.
   *
   * @param object|null $parent New parent, or null to only read the current one.
   *
   * @return object|null The (possibly updated) parent.
   */
  public function parent($parent = null)
  {
      if ($parent === null) {
          return $this->parent;
      }

      $this->parent = $parent;
      $parent->nodes[] = $this;
      $parent->children[] = $this;

      return $this->parent;
  }
  /**
   * Tell whether this node has at least one child element.
   *
   * @return bool
   */
  public function has_child()
  {
      return (bool) $this->children;
  }
  /**
   * Return all child elements, or the one at position $idx.
   *
   * @param int $idx Position of the wanted child; -1 returns the whole list.
   *
   * @return mixed The list of children, a single child, or null when no
   *               child exists at $idx.
   */
  public function children($idx = -1)
  {
      if ($idx === -1) {
          return $this->children;
      }

      return isset($this->children[$idx]) ? $this->children[$idx] : null;
  }
  /**
   * Return the first child element.
   *
   * @return object|null Null when the node has no children.
   */
  public function first_child()
  {
      return (count($this->children) > 0) ? $this->children[0] : null;
  }
  /**
   * Return the last child element.
   *
   * Uses end(), so the internal array pointer of the children list is moved
   * to its last entry.
   *
   * @return object|null Null when the node has no children.
   */
  public function last_child()
  {
      if (count($this->children) === 0) {
          return null;
      }

      return end($this->children);
  }
  /**
   * Return the element that follows this node in its parent's children list.
   *
   * @return object|null Null when there is no parent, when this node is not
   *                     among the parent's children, or when it is the last one.
   */
  public function next_sibling()
  {
      if ($this->parent === null) {
          return null;
      }

      $siblings = $this->parent->children;
      $position = array_search($this, $siblings, true);

      if ($position === false) {
          return null;
      }

      return isset($siblings[$position + 1]) ? $siblings[$position + 1] : null;
  }
  /**
   * Return the element that precedes this node in its parent's children list.
   *
   * @return object|null Null when there is no parent, when this node is not
   *                     among the parent's children, or when it is the first one.
   */
  public function prev_sibling()
  {
      if ($this->parent === null) {
          return null;
      }

      $siblings = $this->parent->children;
      $position = array_search($this, $siblings, true);

      if ($position === false || $position <= 0) {
          return null;
      }

      return $siblings[$position - 1];
  }
  /**
   * Walk up the parent chain and return the nearest ancestor whose tag
   * equals $tag (compared with ===).
   *
   * @param string $tag Tag name to look for.
   *
   * @return object|null The matching ancestor, or null when none exists.
   */
  public function find_ancestor_tag($tag)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      for ($candidate = $this->parent; $candidate !== null; $candidate = $candidate->parent) {
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'Current tag is: ' . $candidate->tag);
          }

          if ($candidate->tag === $tag) {
              return $candidate;
          }
      }

      return null;
  }
  /**
   * Return the markup between this node's opening and closing tag.
   *
   * Precedence: an overridden inner text, then the node's own raw text
   * (passed through the document's restore_noise()), then the concatenated
   * outer text of all entries in $this->nodes.
   *
   * @return string
   */
  public function innertext()
  {
      if (isset($this->_[HDOM_INFO_INNER])) {
          return $this->_[HDOM_INFO_INNER];
      }

      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      $markup = '';

      foreach ($this->nodes as $child) {
          $markup .= $child->outertext();
      }

      return $markup;
  }
  /**
   * Return the markup of this node including its own tags.
   *
   * The root node yields its inner text. For every other node the document
   * callback (if one is set) is invoked first; afterwards an overridden outer
   * text or the node's raw text takes precedence over re-serializing the
   * opening tag, the content and the closing tag.
   *
   * @return string
   */
  public function outertext()
  {
      global $debug_object;

      if (is_object($debug_object)) {
          $suffix = ($this->tag === 'text' && !empty($this->text))
              ? ' with text: ' . $this->text
              : '';
          $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
      }

      if ($this->tag === 'root') {
          return $this->innertext();
      }

      // todo: What is the use of this callback? Remove?
      if ($this->dom && $this->dom->callback !== null) {
          call_user_func_array($this->dom->callback, array($this));
      }

      if (isset($this->_[HDOM_INFO_OUTER])) {
          return $this->_[HDOM_INFO_OUTER];
      }

      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      // Opening tag, produced by the node registered at this node's start index.
      $markup = '';
      if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
          $markup = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
      }

      // Content: an overridden inner text wins over the child nodes.
      if (isset($this->_[HDOM_INFO_INNER])) {
          // todo: <br> should either never have HDOM_INFO_INNER or always
          if ($this->tag !== 'br') {
              $markup .= $this->_[HDOM_INFO_INNER];
          }
      } elseif ($this->nodes) {
          foreach ($this->nodes as $child) {
              $markup .= $this->convert_text($child->outertext());
          }
      }

      // Closing tag, only for nodes that record a non-zero end marker.
      if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
          $markup .= '</' . $this->tag . '>';
      }

      return $markup;
  }
  /**
   * Return the plain-text content of this node.
   *
   * An overridden inner text is returned as is. Text nodes yield their raw
   * text (through the document's restore_noise()); comments, unknown nodes,
   * <script> and <style> yield an empty string. For everything else the text
   * of all entries in $this->nodes is concatenated: a <p> starts after a
   * blank line and each <span> is followed by the document's default span text.
   *
   * @return string
   */
  public function text()
  {
      if (isset($this->_[HDOM_INFO_INNER])) {
          return $this->_[HDOM_INFO_INNER];
      }

      // Loose comparison on purpose: it mirrors switch semantics.
      if ($this->nodetype == HDOM_TYPE_TEXT) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }
      if ($this->nodetype == HDOM_TYPE_COMMENT || $this->nodetype == HDOM_TYPE_UNKNOWN) {
          return '';
      }

      if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
          return '';
      }

      $plain = '';

      // In rare cases (observed for some span and p elements) $this->nodes is
      // null even though clear() was not called explicitly; the cause is
      // unknown, so guard against it.
      if ($this->nodes === null) {
          return $plain;
      }

      foreach ($this->nodes as $child) {
          // Start paragraph after a blank line
          if ($child->tag === 'p') {
              $plain = trim($plain) . "\n\n";
          }

          $plain .= $this->convert_text($child->text());

          // Separate consecutive spans so their text does not run together.
          if ($child->tag === 'span') {
              $plain .= $this->dom->default_span_text;
          }
      }

      return $plain;
  }
  /**
   * Return the inner text with CDATA markers removed: every "<![CDATA["
   * (matched case-insensitively) and every "]]>" is stripped.
   *
   * @return string
   */
  public function xmltext()
  {
      $withoutOpeners = str_ireplace('<![CDATA[', '', $this->innertext());

      return str_replace(']]>', '', $withoutOpeners);
  }
  /**
   * Re-serialize this node's opening tag.
   *
   * Text-like nodes (text, comment, unknown) return their raw text instead.
   * Attributes whose value is null or false are treated as removed; a value
   * of true produces a bare attribute name. Recorded whitespace and quote
   * style are reused where available; missing entries are silenced with @.
   *
   * @return string
   */
  public function makeup()
  {
      // text, comment, unknown
      if (isset($this->_[HDOM_INFO_TEXT])) {
          return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
      }

      $markup = '<' . $this->tag;
      $position = 0;

      foreach ($this->attr as $name => $value) {
          // The position counts every attribute, including removed ones.
          $i = $position++;

          // skip removed attribute
          if ($value === null || $value === false) {
              continue;
          }

          $markup .= @$this->_[HDOM_INFO_SPACE][$i][0];

          // no value attr: nowrap, checked selected...
          if ($value === true) {
              $markup .= $name;
              continue;
          }

          // switch compares loosely, so a missing entry (null) selects the
          // double quote.
          switch (@$this->_[HDOM_INFO_QUOTE][$i]) {
              case HDOM_QUOTE_DOUBLE:
                  $quote = '"';
                  break;
              case HDOM_QUOTE_SINGLE:
                  $quote = '\'';
                  break;
              default:
                  $quote = '';
          }

          $markup .= $name
              . @$this->_[HDOM_INFO_SPACE][$i][1]
              . '='
              . @$this->_[HDOM_INFO_SPACE][$i][2]
              . $quote
              . $value
              . $quote;
      }

      return $this->dom->restore_noise($markup) . $this->_[HDOM_INFO_ENDSPACE] . '>';
  }
  /**
   * Find the nodes below this node that match a CSS selector.
   *
   * The selector string is parsed into a list of selector chains. Each chain
   * is evaluated step by step (iteratively, not recursively) starting at this
   * node; the results of all chains are merged and ordered by node index.
   *
   * @param string   $selector  CSS selector, optionally a comma-separated list.
   * @param int|null $idx       Null returns all matches; otherwise the match at
   *                            that position (negative values count from the end).
   * @param bool     $lowercase Passed on to seek() for every step.
   *
   * @return mixed Array of nodes when $idx is null, otherwise a single node
   *               or null when no match exists at $idx.
   */
  public function find($selector, $idx = null, $lowercase = false)
  {
      $selectors = $this->parse_selector($selector);
      if (count($selectors) === 0) { return array(); }

      $found_keys = array();

      foreach ($selectors as $chain) {
          if (count($chain) === 0) { return array(); }
          if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

          // Node indexes reached so far; the search starts at this node.
          $head = array($this->_[HDOM_INFO_BEGIN] => 1);
          $combinator = ' ';

          foreach ($chain as $step) {
              $reached = array();

              foreach ($head as $key => $unused) {
                  $start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                  $start->seek($step, $reached, $combinator, $lowercase);
              }

              $head = $reached;
              $combinator = $step[4]; // combinator leading to the next step
          }

          foreach ($head as $key => $unused) {
              $found_keys[$key] = 1;
          }
      }

      // Document order
      ksort($found_keys);

      $found = array();
      foreach ($found_keys as $key => $unused) {
          $found[] = $this->dom->nodes[$key];
      }

      if ($idx === null) {
          return $found;
      }

      if ($idx < 0) {
          $idx = count($found) + $idx;
      }

      return isset($found[$idx]) ? $found[$idx] : null;
  }
  /**
   * Collect the nodes that match one selector step, relative to this node.
   *
   * Candidates are chosen by the combinator that led to this step:
   * ' ' takes all nodes between this node and its end marker, '>' the direct
   * children, '+' the immediately following sibling and '~' all following
   * siblings. Each candidate is then tested against tag, id, classes and
   * attribute conditions.
   *
   * Changes compared to the previous implementation:
   * - sibling lookups compare nodes by identity (strict); a loose comparison
   *   of nodes recurses through parent/children references,
   * - '~' no longer offers the element itself as its own sibling,
   * - the end marker of a missing ancestor is no longer read,
   * - class names are compared strictly.
   *
   * @param array  $selector   One step as produced by parse_selector():
   *                           array(tag, id, classes, attributes, combinator).
   * @param array  $ret        Result set, filled by reference with
   *                           node index => 1 for every match.
   * @param string $parent_cmd Combinator that connects this step to the
   *                           previous one.
   * @param bool   $lowercase  When true, node classes and attribute values are
   *                           lowercased before comparison.
   *
   * @return void
   */
  protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      // Slot 4 (the combinator to the next step) is consumed by find().
      list($tag, $id, $class, $attributes) = $selector;
      $nodes = array();

      if ($parent_cmd === ' ') { // Descendant Combinator
          // Find parent closing tag if the current element doesn't have a closing
          // tag (i.e. void element)
          $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
          if ($end == 0) {
              $parent = $this->parent;
              while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
                  $end -= 1;
                  $parent = $parent->parent;
              }
              // No ancestor with an end marker: there is nothing to add.
              if ($parent !== null) {
                  $end += $parent->_[HDOM_INFO_END];
              }
          }

          // Get list of target nodes
          $nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
          $nodes_count = $end - $nodes_start;
          $nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
      } elseif ($parent_cmd === '>') { // Child Combinator
          $nodes = $this->children;
      } elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
          // Sibling combinators: locate this node by identity.
          $index = array_search($this, $this->parent->children, true);

          if ($index !== false) {
              if ($parent_cmd === '+') { // Next-Sibling Combinator
                  if ($index + 1 < count($this->parent->children)) {
                      $nodes[] = $this->parent->children[$index + 1];
                  }
              } else { // Subsequent Sibling Combinator
                  $nodes = array_slice($this->parent->children, $index + 1);
              }
          }
      }

      // Go through each element starting at this element until the end tag
      // Note: If this element is a void tag, any previous void element is
      // skipped.
      foreach ($nodes as $node) {
          $pass = true;

          // Skip root nodes
          if (!$node->parent) {
              $pass = false;
          }

          // Skip if node isn't a child node (i.e. text nodes)
          if ($pass && !in_array($node, $node->parent->children, true)) {
              $pass = false;
          }

          // Skip if tag doesn't match
          if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
              $pass = false;
          }

          // Skip if ID doesn't exist
          if ($pass && $id !== '' && !isset($node->attr['id'])) {
              $pass = false;
          }

          // Check if ID matches
          if ($pass && $id !== '' && isset($node->attr['id'])) {
              // Note: Only consider the first ID (as browsers do)
              $node_id = explode(' ', trim($node->attr['id']))[0];

              if ($id !== $node_id) { $pass = false; }
          }

          // Check if all class(es) exist
          if ($pass && $class !== '' && is_array($class) && !empty($class)) {
              if (isset($node->attr['class'])) {
                  $node_classes = explode(' ', $node->attr['class']);

                  if ($lowercase) {
                      $node_classes = array_map('strtolower', $node_classes);
                  }

                  foreach ($class as $c) {
                      if (!in_array($c, $node_classes, true)) {
                          $pass = false;
                          break;
                      }
                  }
              } else {
                  $pass = false;
              }
          }

          // Check attributes
          if ($pass
              && $attributes !== ''
              && is_array($attributes)
              && !empty($attributes)) {
              foreach ($attributes as $a) {
                  list (
                      $att_name,
                      $att_expr,
                      $att_val,
                      $att_inv,
                      $att_case_sensitivity
                  ) = $a;

                  // Handle indexing attributes (i.e. "[2]")
                  /**
                   * Note: This is not supported by the CSS Standard but adds
                   * the ability to select items compatible to XPath (i.e.
                   * the 3rd element within it's parent).
                   *
                   * Note: This doesn't conflict with the CSS Standard which
                   * doesn't work on numeric attributes anyway.
                   */
                  if (is_numeric($att_name)
                      && $att_expr === ''
                      && $att_val === '') {
                      $count = 0;

                      // Find index of current element in parent
                      foreach ($node->parent->children as $c) {
                          if ($c->tag === $node->tag) { ++$count; }
                          if ($c === $node) { break; }
                      }

                      // If this is the correct node, continue with next
                      // attribute
                      if ($count === (int)$att_name) { continue; }
                  }

                  // Check attribute availability
                  if ($att_inv) { // Attribute should NOT be set
                      if (isset($node->attr[$att_name])) {
                          $pass = false;
                          break;
                      }
                  } else { // Attribute should be set
                      // todo: "plaintext" is not a valid CSS selector!
                      if ($att_name !== 'plaintext'
                          && !isset($node->attr[$att_name])) {
                          $pass = false;
                          break;
                      }
                  }

                  // Continue with next attribute if expression isn't defined
                  if ($att_expr === '') { continue; }

                  // If they have told us that this is a "plaintext"
                  // search then we want the plaintext of the node - right?
                  // todo "plaintext" is not a valid CSS selector!
                  if ($att_name === 'plaintext') {
                      $nodeKeyValue = $node->text();
                  } else {
                      $nodeKeyValue = $node->attr[$att_name];
                  }

                  if (is_object($debug_object)) {
                      $debug_object->debug_log(2,
                          'testing node: '
                          . $node->tag
                          . ' for attribute: '
                          . $att_name
                          . $att_expr
                          . $att_val
                          . ' where nodes value is: '
                          . $nodeKeyValue
                      );
                  }

                  // If lowercase is set, do a case insensitive test of
                  // the value of the selector.
                  if ($lowercase) {
                      $check = $this->match(
                          $att_expr,
                          strtolower($att_val),
                          strtolower($nodeKeyValue),
                          $att_case_sensitivity
                      );
                  } else {
                      $check = $this->match(
                          $att_expr,
                          $att_val,
                          $nodeKeyValue,
                          $att_case_sensitivity
                      );
                  }

                  if (is_object($debug_object)) {
                      $debug_object->debug_log(2,
                          'after match: '
                          . ($check ? 'true' : 'false')
                      );
                  }

                  if (!$check) {
                      $pass = false;
                      break;
                  }
              }
          }

          // Found a match. Add to list and clear node
          if ($pass) { $ret[$node->_[HDOM_INFO_BEGIN]] = 1; }
          unset($node);
      }

      // It's passed by reference so this is actually what this function returns.
      if (is_object($debug_object)) {
          $debug_object->debug_log(1, 'EXIT - ret: ', $ret);
      }
  }
  /**
   * Test an attribute value against a selector value using a CSS attribute
   * operator.
   *
   * @param string $exp              Operator: '=', '!=', '^=', '$=', '*=',
   *                                 '|=' or '~='.
   * @param string $pattern          Value given in the selector.
   * @param string $value            Value found on the node.
   * @param string $case_sensitivity 'i' lowercases both sides before comparing.
   *
   * @return bool|int Truthy on a match. The regular-expression based
   *                  operators return the result of preg_match(). Unknown
   *                  operators never match.
   */
  protected function match($exp, $pattern, $value, $case_sensitivity)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      if ($case_sensitivity === 'i') {
          $pattern = strtolower($pattern);
          $value = strtolower($value);
      }

      switch ($exp) {
          case '=':
              return ($value === $pattern);
          case '!=':
              return ($value !== $pattern);
          case '^=':
              return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
          case '$=':
              return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
          case '*=':
              return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
          case '|=':
              /**
               * [att|=val]
               *
               * Represents an element with the att attribute, its value
               * either being exactly "val" or beginning with "val"
               * immediately followed by "-" (U+002D).
               *
               * A plain prefix test is not enough: [lang|=en] must match
               * "en" and "en-US" but not "english".
               */
              return $value === $pattern
                  || strpos($value, $pattern . '-') === 0;
          case '~=':
              /**
               * [att~=val]
               *
               * Represents an element with the att attribute whose value is a
               * whitespace-separated list of words, one of which is exactly
               * "val". If "val" contains whitespace, it will never represent
               * anything (since the words are separated by spaces). Also if
               * "val" is the empty string, it will never represent anything.
               */
              return in_array($pattern, explode(' ', trim($value)), true);
      }

      return false;
  }
  /**
   * Split a CSS selector string into selector chains.
   *
   * The result is a list with one entry per comma-separated selector. Each
   * entry is a list of steps, and each step is
   * array(tag, id, classes, attributes, combinator) where
   *  - classes is '' or a list of class names,
   *  - attributes is '' or a list of
   *    array(name, expression, value, inverted flag, case-sensitivity),
   *  - combinator is the separator leading to the NEXT step (' ', '>', '+',
   *    '~') or '' for the last step of a selector in a list.
   *
   * The main pattern is modified from mootools (https://mootools.net/) and
   * captures, in this order: [1] the tag name (word characters, colons,
   * asterisks, hyphens), [2] an optional "#id", [3] optional chained
   * ".class" names, [4] an optional run of "[...]" attribute conditions and
   * [5] the separator. Attribute names may contain a colon (so that
   * <tag attr:ibute="x"> can be matched; such attributes must be read with
   * getAttribute because $node->x:y is not valid PHP) and may be preceded by
   * an "@" that is not captured.
   *
   * @param string $selector_string The selector to parse.
   *
   * @return array
   */
  protected function parse_selector($selector_string)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      // phpcs:ignore Generic.Files.LineLength
      $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

      // A trailing ' ' acts as pseudo separator so the last step matches too.
      preg_match_all(
          $pattern,
          trim($selector_string) . ' ',
          $matches,
          PREG_SET_ORDER
      );

      if (is_object($debug_object)) {
          $debug_object->debug_log(2, 'Matches Array: ', $matches);
      }

      $selectors = array();
      $chain = array();

      foreach ($matches as $m) {
          // Skip NoOps
          $full = trim($m[0]);
          if ($full === '' || $full === '/' || $full === '//') { continue; }

          $tag = $this->dom->lowercase ? strtolower($m[1]) : $m[1];
          $id = $m[2];
          $classes = ($m[3] !== '') ? explode('.', $m[3]) : $m[3];
          $conditions = $m[4];

          /* Extract attributes (pattern based on the pattern above!)
           * [0] - full match
           * [1] - attribute name
           * [2] - attribute expression
           * [3] - attribute value
           * [4] - case sensitivity
           *
           * Note: Attributes can be negated with a "!" prefix to their name
           */
          if ($conditions !== '') {
              preg_match_all(
                  "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
                  trim($conditions),
                  $attributes,
                  PREG_SET_ORDER
              );

              $conditions = array();

              foreach ($attributes as $att) {
                  // Skip empty matches
                  if (trim($att[0]) === '') { continue; }

                  $inverted = (isset($att[1][0]) && $att[1][0] === '!');

                  $conditions[] = array(
                      $inverted ? substr($att[1], 1) : $att[1], // Name
                      (isset($att[2])) ? $att[2] : '', // Expression
                      (isset($att[3])) ? $att[3] : '', // Value
                      $inverted, // Inverted Flag
                      (isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
                  );
              }
          }

          // Sanitize separator: pure whitespace is the descendant combinator.
          $combinator = trim($m[5]);
          if ($combinator === '' && $m[5] !== '') {
              $combinator = ' ';
          }

          // A comma ends the current selector of a selector list.
          $is_list = ($combinator === ',');
          if ($is_list) {
              $combinator = '';
          }

          $chain[] = array($tag, $id, $classes, $conditions, $combinator);

          if ($is_list) {
              $selectors[] = $chain;
              $chain = array();
          }
      }

      if (count($chain) > 0) { $selectors[] = $chain; }

      return $selectors;
  }
  /**
   * Magic read access.
   *
   * A set (non-null) attribute of that name wins and is returned converted
   * to the target charset. Otherwise the virtual properties outertext,
   * innertext, plaintext and xmltext are served. For any other name the
   * result is a boolean telling whether the attribute key exists at all.
   *
   * @param string $name Attribute or virtual property name.
   *
   * @return mixed
   */
  public function __get($name)
  {
      if (isset($this->attr[$name])) {
          return $this->convert_text($this->attr[$name]);
      }

      if ($name === 'outertext') { return $this->outertext(); }
      if ($name === 'innertext') { return $this->innertext(); }
      if ($name === 'plaintext') { return $this->text(); }
      if ($name === 'xmltext') { return $this->xmltext(); }

      return array_key_exists($name, $this->attr);
  }
  /**
   * Magic write access.
   *
   * "outertext" and "innertext" override the node's serialized text (for
   * nodes that carry raw text, "innertext" replaces that raw text). Every
   * other name sets an attribute; a new attribute also gets default
   * whitespace and double quotes recorded for serialization.
   *
   * @param string $name  Attribute or virtual property name.
   * @param mixed  $value New value.
   *
   * @return mixed
   */
  public function __set($name, $value)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      if ($name === 'outertext') {
          return $this->_[HDOM_INFO_OUTER] = $value;
      }

      if ($name === 'innertext') {
          $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
          return $this->_[$slot] = $value;
      }

      $isNewAttribute = !isset($this->attr[$name]);

      if ($isNewAttribute) {
          $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
          $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
      }

      $this->attr[$name] = $value;
  }
  /**
   * Magic isset(): the virtual properties outertext, innertext and plaintext
   * always exist; any other name exists when the attribute key is present,
   * even if its value is null.
   *
   * @param string $name Attribute or virtual property name.
   *
   * @return bool
   */
  public function __isset($name)
  {
      if ($name === 'outertext' || $name === 'innertext' || $name === 'plaintext') {
          return true;
      }

      // no value attr: nowrap, checked selected...
      return array_key_exists($name, $this->attr);
  }
  /**
   * Magic unset(): remove an attribute. Attributes whose value is null are
   * left in place because isset() does not report them.
   *
   * @param string $name Attribute name.
   *
   * @return void
   */
  public function __unset($name)
  {
      if (!isset($this->attr[$name])) {
          return;
      }

      unset($this->attr[$name]);
  }
  /**
   * Convert text from the document's charset to its target charset.
   *
   * No conversion happens when either charset is unknown or both are equal.
   * When the target is UTF-8 and the text already validates as UTF-8, the
   * declared source charset is assumed to be wrong and the text is kept.
   * For a UTF-8 target a byte order mark at the start or end is removed.
   *
   * @param string $text Text to convert.
   *
   * @return string|false The converted text; whatever iconv() returns when a
   *                      conversion is attempted.
   */
  public function convert_text($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $sourceCharset = '';
      $targetCharset = '';

      if ($this->dom) {
          $sourceCharset = strtoupper($this->dom->_charset);
          $targetCharset = strtoupper($this->dom->_target_charset);
      }

      if (is_object($debug_object)) {
          $debug_object->debug_log(3,
              'source charset: '
              . $sourceCharset
              . ' target charaset: '
              . $targetCharset
          );
      }

      $result = $text;

      $charsetsDiffer = !empty($sourceCharset)
          && !empty($targetCharset)
          && (strcasecmp($sourceCharset, $targetCharset) != 0);

      if ($charsetsDiffer) {
          // The reported encoding may be wrong: text that is already valid
          // UTF-8 is not converted again.
          $alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
              && $this->is_utf8($text);

          if (!$alreadyUtf8) {
              $result = iconv($sourceCharset, $targetCharset, $text);
          }
      }

      // Strip a BOM from either end of UTF-8 output.
      if ($targetCharset === 'UTF-8') {
          $bom = "\xef\xbb\xbf";

          if (substr($result, 0, 3) === $bom) {
              $result = substr($result, 3);
          }

          if (substr($result, -3) === $bom) {
              $result = substr($result, 0, -3);
          }
      }

      return $result;
  }
  /**
   * Check whether a byte string is structurally valid UTF-8.
   *
   * Every lead byte must be followed by the announced number of continuation
   * bytes (0x80-0xBF). The legacy 5 and 6 byte forms are accepted; overlong
   * encodings and surrogates are not rejected.
   *
   * @param string $str Bytes to check.
   *
   * @return bool
   */
  public static function is_utf8($str)
  {
      $len = strlen($str);

      for ($i = 0; $i < $len; $i++) {
          $c = ord($str[$i]);

          // 7-bit ASCII. Byte 128 (0x80) is a continuation byte, not ASCII,
          // so the boundary is ">= 128" rather than "> 128".
          if ($c < 128) {
              continue;
          }

          if ($c >= 254) { return false; }
          elseif ($c >= 252) { $bits = 6; }
          elseif ($c >= 248) { $bits = 5; }
          elseif ($c >= 240) { $bits = 4; }
          elseif ($c >= 224) { $bits = 3; }
          elseif ($c >= 192) { $bits = 2; }
          else { return false; } // continuation byte without a lead byte

          // The sequence must not run past the end of the string.
          if (($i + $bits) > $len) { return false; }

          while ($bits > 1) {
              $i++;
              $b = ord($str[$i]);
              if ($b < 128 || $b > 191) { return false; }
              $bits--;
          }
      }

      return true;
  }
  /**
   * Determine the display size of an <img> element.
   *
   * The width and height attributes are used when present. A dimension that
   * is still unknown is then taken from the inline style, provided the
   * declaration is an integer pixel value such as "width: 120px".
   *
   * Not implemented: sizes that come from CSS rules (class or id selectors on
   * the element or its ancestors) or from external style sheets.
   *
   * @return array|false array('height' => ..., 'width' => ...) with -1 for
   *                     every unknown dimension, or false when this node is
   *                     not an <img>.
   */
  public function get_display_size()
  {
      if ($this->tag !== 'img') {
          return false;
      }

      $size = array('height' => -1, 'width' => -1);

      // Explicit attributes take precedence.
      foreach (array('width', 'height') as $dimension) {
          if (isset($this->attr[$dimension])) {
              $size[$dimension] = $this->attr[$dimension];
          }
      }

      // Now look for an inline style.
      if (isset($this->attr['style'])) {
          // Thanks to user gnarf from stackoverflow for this regular expression.
          $declarations = array();
          preg_match_all(
              '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
              $this->attr['style'],
              $matches,
              PREG_SET_ORDER
          );

          foreach ($matches as $match) {
              // The value group also swallows whitespace before the ';'.
              $declarations[$match[1]] = trim($match[2]);
          }

          foreach (array('width', 'height') as $dimension) {
              if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
                  continue;
              }

              // Only pixel values are understood.
              if (strtolower(substr($declarations[$dimension], -2)) !== 'px') {
                  continue;
              }

              $pixels = substr($declarations[$dimension], 0, -2);

              // filter_var() returns the integer itself, so "0" must be told
              // apart from a failed validation with a strict comparison.
              if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                  $size[$dimension] = $pixels;
              }
          }
      }

      return $size;
  }
  /**
   * Render this node via outertext() and optionally write the result to disk.
   *
   * @param string $filepath Target file; nothing is written when it is ''.
   * @return mixed Whatever outertext() produced.
   */
  function save($filepath = '')
  {
      $markup = $this->outertext();

      if ($filepath === '') {
          return $markup;
      }

      // LOCK_EX: take an exclusive lock on the file while writing.
      file_put_contents($filepath, $markup, LOCK_EX);

      return $markup;
  }
  /**
   * Add one or more class names to this node's `class` attribute.
   *
   * Names that are already present (per hasClass()) are skipped, as are
   * empty names produced by consecutive spaces in a string argument.
   *
   * @param string|array $class Space separated string or list of class names.
   * Any other type is reported to the debug object (if there is one) and
   * otherwise ignored.
   * @return void
   */
  function addClass($class)
  {
      // Without this declaration $debug_object would be an undefined local
      // and the diagnostic below could never fire.
      global $debug_object;

      if (is_string($class)) {
          $class = explode(' ', $class);
      }

      if (!is_array($class)) {
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
          }
          return;
      }

      foreach ($class as $c) {
          // explode() yields '' for leading, trailing or doubled spaces;
          // appending those would only add stray whitespace.
          if ($c === '') {
              continue;
          }

          if (!isset($this->class)) {
              $this->class = $c;
              continue;
          }

          if ($this->hasClass($c)) {
              continue;
          }

          $this->class .= ' ' . $c;
      }
  }
  /**
   * Check whether this node's `class` attribute contains a class name.
   *
   * The attribute is split on single spaces and compared strictly.
   *
   * @param string $class Class name to look for. Any other type is reported
   * to the debug object (if there is one) and yields false.
   * @return bool
   */
  function hasClass($class)
  {
      // Without this declaration $debug_object would be an undefined local
      // and the diagnostic below could never fire.
      global $debug_object;

      if (!is_string($class)) {
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'Invalid type: ', gettype($class));
          }
          return false;
      }

      if (isset($this->class)) {
          return in_array($class, explode(' ', $this->class), true);
      }

      return false;
  }
  /**
   * Remove class names from this node's `class` attribute.
   *
   * @param string|array|null $class Space separated string or list of names
   * to drop; null removes the whole attribute. Other types are ignored.
   * @return void
   */
  function removeClass($class = null)
  {
      // Nothing to do for nodes without a class attribute.
      if (!isset($this->class)) {
          return;
      }

      if ($class === null) {
          $this->removeAttribute('class');
          return;
      }

      $unwanted = is_string($class) ? explode(' ', $class) : $class;

      if (!is_array($unwanted)) {
          return;
      }

      $remaining = array_diff(explode(' ', $this->class), $unwanted);

      if (count($remaining) === 0) {
          // No class left: drop the attribute instead of leaving it empty.
          $this->removeAttribute('class');
          return;
      }

      $this->class = implode(' ', $remaining);
  }
  /**
   * Return the node's attribute storage ($this->attr) unchanged.
   *
   * @return mixed
   */
  function getAllAttributes()
  {
      $attributes = $this->attr;

      return $attributes;
  }
  /**
   * Read an attribute; method-call alias for the magic getter __get().
   *
   * @param string $name Attribute name.
   * @return mixed Whatever __get() returns for $name.
   */
  function getAttribute($name)
  {
      $value = $this->__get($name);

      return $value;
  }
  /**
   * Write an attribute; method-call alias for the magic setter __set().
   *
   * @param string $name Attribute name.
   * @param mixed $value Value handed to __set().
   * @return void
   */
  function setAttribute($name, $value)
  {
      // Call the magic method directly so the same code path is used no
      // matter what $name is.
      $this->__set($name, $value);
  }
  /**
   * Test for an attribute; method-call alias for the magic __isset().
   *
   * @param string $name Attribute name.
   * @return mixed Whatever __isset() returns for $name.
   */
  function hasAttribute($name)
  {
      $present = $this->__isset($name);

      return $present;
  }
  /**
   * Remove an attribute by handing null to the magic setter __set().
   *
   * @param string $name Attribute name.
   * @return void
   */
  function removeAttribute($name)
  {
      $no_value = null;

      $this->__set($name, $no_value);
  }
  /**
   * Detach this node from its parent by asking the parent to remove it.
   * Does nothing when the node has no parent.
   *
   * @return void
   */
  function remove()
  {
      if (!$this->parent) {
          return;
      }

      $this->parent->removeChild($this);
  }
  /**
   * Remove a child node, and everything below it, from this node and from
   * the owning DOM's node list.
   *
   * The call is a no-op unless $node is found (by identity) in this node's
   * $nodes, this node's $children and the DOM's $nodes.
   *
   * @param object $node Node to remove.
   * @return void
   */
  function removeChild($node)
  {
      $node_index = array_search($node, $this->nodes, true);
      $child_index = array_search($node, $this->children, true);
      $dom_index = array_search($node, $this->dom->nodes, true);

      if ($node_index === false || $child_index === false || $dom_index === false) {
          return;
      }

      // Remove the element children of $node first (recursively).
      foreach ($node->children as $descendant) {
          $node->removeChild($descendant);
      }

      // Then drop whatever is still listed in $node->nodes.
      foreach ($node->nodes as $entity) {
          $local_index = array_search($entity, $node->nodes, true);
          $shared_index = array_search($entity, $node->dom->nodes, true);

          if ($local_index === false || $shared_index === false) {
              continue;
          }

          unset($node->nodes[$local_index]);
          unset($node->dom->nodes[$shared_index]);
      }

      // Finally unlink $node itself and release its references.
      unset($this->nodes[$node_index]);
      unset($this->children[$child_index]);
      unset($this->dom->nodes[$dom_index]);

      $node->clear();
  }
  /**
   * Look up by id: runs find() with the selector "#<id>" and index 0.
   *
   * @param string $id Element id.
   * @return mixed Result of find().
   */
  function getElementById($id)
  {
      $selector = "#$id";

      return $this->find($selector, 0);
  }
  /**
   * Look up by id: runs find() with the selector "#<id>" and the given index.
   *
   * @param string $id Element id.
   * @param int|null $idx Index forwarded to find().
   * @return mixed Result of find().
   */
  function getElementsById($id, $idx = null)
  {
      $selector = "#$id";

      return $this->find($selector, $idx);
  }
  /**
   * Look up by tag name: runs find() with $name as selector and index 0.
   *
   * @param string $name Tag name (used verbatim as the selector).
   * @return mixed Result of find().
   */
  function getElementByTagName($name)
  {
      $match = $this->find($name, 0);

      return $match;
  }
  /**
   * Look up by tag name: runs find() with $name as selector and the given
   * index.
   *
   * @param string $name Tag name (used verbatim as the selector).
   * @param int|null $idx Index forwarded to find().
   * @return mixed Result of find().
   */
  function getElementsByTagName($name, $idx = null)
  {
      $matches = $this->find($name, $idx);

      return $matches;
  }
  /**
   * camelCase alias for parent() called without arguments.
   *
   * @return mixed Result of parent().
   */
  function parentNode()
  {
      $result = $this->parent();

      return $result;
  }
  /**
   * camelCase alias for children().
   *
   * @param int $idx Index forwarded to children(); defaults to -1.
   * @return mixed Result of children().
   */
  function childNodes($idx = -1)
  {
      $result = $this->children($idx);

      return $result;
  }
  /**
   * camelCase alias for first_child().
   *
   * @return mixed Result of first_child().
   */
  function firstChild()
  {
      $result = $this->first_child();

      return $result;
  }
  /**
   * camelCase alias for last_child().
   *
   * @return mixed Result of last_child().
   */
  function lastChild()
  {
      $result = $this->last_child();

      return $result;
  }
  /**
   * camelCase alias for next_sibling().
   *
   * @return mixed Result of next_sibling().
   */
  function nextSibling()
  {
      $result = $this->next_sibling();

      return $result;
  }
  /**
   * camelCase alias for prev_sibling().
   *
   * @return mixed Result of prev_sibling().
   */
  function previousSibling()
  {
      $result = $this->prev_sibling();

      return $result;
  }
  /**
   * camelCase alias for has_child().
   *
   * @return mixed Result of has_child().
   */
  function hasChildNodes()
  {
      $result = $this->has_child();

      return $result;
  }
  /**
   * Return this node's tag ($this->tag).
   *
   * @return mixed
   */
  function nodeName()
  {
      $name = $this->tag;

      return $name;
  }
  /**
   * Hand this node to $node->parent() and return $node, so calls can be
   * chained on the appended node.
   *
   * @param object $node Node to attach.
   * @return object The same $node.
   */
  function appendChild($node)
  {
      $new_parent = $this;
      $node->parent($new_parent);

      return $node;
  }
  1168. }
  1169. class simple_html_dom
  1170. {
  // Root node of the parsed tree; (re)created by prepare().
  public $root = null;
  // Node list of this document; reset to an empty array by prepare().
  public $nodes = array();
  // Value stored by set_callback() and cleared by remove_callback().
  public $callback = null;
  // When true, tag and attribute names are lowercased while parsing.
  public $lowercase = false;
  // Byte length of the trimmed input as handed to prepare().
  public $original_size;
  // Current byte length of $doc (changes when noise is replaced).
  public $size;
  // Current read offset into $doc.
  protected $pos;
  // The document text being parsed.
  protected $doc;
  // Character at $pos, or null once the end of $doc has been reached.
  protected $char;
  // Counter that is incremented for each node created by the parser and
  // stored in the HDOM_INFO_BEGIN / HDOM_INFO_END slots of nodes.
  protected $cursor;
  // Node that newly parsed nodes are linked to (see link_nodes()).
  protected $parent;
  // Placeholder key => original text, filled by remove_noise().
  protected $noise = array();
  // Character sets used with skip() / copy_skip() / copy_until().
  protected $token_blank = " \t\r\n";
  protected $token_equal = ' =/>';
  protected $token_slash = " />\r\n\t";
  protected $token_attr = ' >';
  // Charset determined by parse_charset().
  public $_charset = '';
  // Charset passed to the constructor as $target_charset.
  public $_target_charset = '';
  // Inner text assigned to <br> nodes by read_tag().
  protected $default_br_text = '';
  // Set by prepare() from $defaultSpanText.
  public $default_span_text = '';
  // Tags that never become the current parent after their start tag.
  protected $self_closing_tags = array(
      'area' => 1,
      'base' => 1,
      'br' => 1,
      'col' => 1,
      'embed' => 1,
      'hr' => 1,
      'img' => 1,
      'input' => 1,
      'link' => 1,
      'meta' => 1,
      'param' => 1,
      'source' => 1,
      'track' => 1,
      'wbr' => 1
  );
  // Tags whose end tag may close ancestors that were left open
  // (checked in the end-tag branch of read_tag()).
  protected $block_tags = array(
      'body' => 1,
      'div' => 1,
      'form' => 1,
      'root' => 1,
      'span' => 1,
      'table' => 1
  );
  // Start tag => set of open parent tags that read_tag() closes implicitly
  // when that start tag is encountered.
  protected $optional_closing_tags = array(
      // Not optional, see
      // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
      'b' => array('b' => 1),
      'dd' => array('dd' => 1, 'dt' => 1),
      // Not optional, see
      // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
      'dl' => array('dd' => 1, 'dt' => 1),
      'dt' => array('dd' => 1, 'dt' => 1),
      'li' => array('li' => 1),
      'optgroup' => array('optgroup' => 1, 'option' => 1),
      'option' => array('optgroup' => 1, 'option' => 1),
      'p' => array('p' => 1),
      'rp' => array('rp' => 1, 'rt' => 1),
      'rt' => array('rp' => 1, 'rt' => 1),
      'td' => array('td' => 1, 'th' => 1),
      'th' => array('td' => 1, 'th' => 1),
      'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
  );
  /**
   * Create a DOM object and, when $str is given, load it immediately.
   *
   * @param string|null $str HTML text, a http(s) URL or a path to an
   * existing file. URLs and files are read through load_file(), anything
   * else is parsed directly with load().
   * @param bool $lowercase Forwarded to load().
   * @param bool $forceTagsClosed When false, the table of optional closing
   * tags is emptied so that no parent tag is closed implicitly.
   * @param string $target_charset Stored in $this->_target_charset.
   * @param bool $stripRN Forwarded to load().
   * @param string $defaultBRText Forwarded to load().
   * @param string $defaultSpanText Forwarded to load().
   * @param int $options Forwarded to load().
   */
  function __construct(
      $str = null,
      $lowercase = true,
      $forceTagsClosed = true,
      $target_charset = DEFAULT_TARGET_CHARSET,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT,
      $options = 0)
  {
      // Forcing tags to be closed implies that we don't trust the html, but
      // it can lead to parsing errors if we SHOULD trust the html.
      // This has to target the property the parser actually reads
      // (optional_closing_tags) and has to happen before anything is parsed,
      // otherwise the flag has no effect on the document loaded below.
      if (!$forceTagsClosed) {
          $this->optional_closing_tags = array();
      }

      $this->_target_charset = $target_charset;

      if ($str) {
          // Accept https:// as well as http:// sources.
          if (preg_match('/^https?:\/\//i', $str) || is_file($str)) {
              $this->load_file($str);
          } else {
              $this->load(
                  $str,
                  $lowercase,
                  $stripRN,
                  $defaultBRText,
                  $defaultSpanText,
                  $options
              );
          }
      }
  }
  /**
   * Release the references held by this object (see clear()) when it is
   * destroyed.
   */
  function __destruct()
  {
      // All cleanup lives in clear() so it can also be triggered manually.
      $this->clear();
  }
  /**
   * Parse an HTML string into this DOM object.
   *
   * Parts of the document that must not be interpreted as markup (scripts,
   * CDATA, comments, styles, code blocks, server side code and optionally
   * Smarty tags) are swapped for placeholders through remove_noise() before
   * the actual parsing starts. The order of these steps is significant.
   *
   * @param string $str HTML text.
   * @param bool $lowercase Forwarded to prepare().
   * @param bool $stripRN Replace every "\r" and "\n" with a space.
   * @param string $defaultBRText Forwarded to prepare().
   * @param string $defaultSpanText Forwarded to prepare().
   * @param int $options Bit mask; HDOM_SMARTY_AS_TEXT hides Smarty tags.
   * @return simple_html_dom $this, so calls can be chained.
   */
  function load(
      $str,
      $lowercase = true,
      $stripRN = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT,
      $options = 0)
  {
      global $debug_object;

      $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

      // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
      // <script> blocks are hidden before anything else (in particular
      // before <style> blocks).
      $script_patterns = array(
          "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is",
          "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is",
      );

      foreach ($script_patterns as $pattern) {
          $this->remove_noise($pattern);
      }

      if ($stripRN) {
          // Turn line breaks into blanks and refresh the cached length.
          $this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);
          $this->size = strlen($this->doc);
      }

      // array(pattern, replace the entire match instead of the first group)
      $noise_patterns = array(
          // cdata
          array("'<!\[CDATA\[(.*?)\]\]>'is", true),
          // comments
          array("'<!--(.*?)-->'is", false),
          // <style> tags
          array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
          array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
          // preformatted tags
          array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
          // server side scripts
          array("'(<\?)(.*?)(\?>)'s", true),
      );

      if ($options & HDOM_SMARTY_AS_TEXT) {
          // Smarty scripts
          $noise_patterns[] = array("'(\{\w)(.*?)(\})'s", true);
      }

      foreach ($noise_patterns as $entry) {
          $this->remove_noise($entry[0], $entry[1]);
      }

      $this->parse();

      // Mark the end of the root node, then work out the charset.
      $this->root->_[HDOM_INFO_END] = $this->cursor;
      $this->parse_charset();

      return $this;
  }
  /**
   * Read a document with file_get_contents() and load it.
   *
   * All arguments are passed on to file_get_contents() unchanged.
   *
   * @return false|null false when the source could not be read; nothing is
   * returned after a successful load.
   */
  function load_file()
  {
      $arguments = func_get_args();
      $contents = call_user_func_array('file_get_contents', $arguments);

      if ($contents === false) {
          return false;
      }

      $this->load($contents, true);
  }
  /**
   * Store a callback in $this->callback.
   *
   * @param mixed $function_name Value to store.
   * @return void
   */
  function set_callback($function_name)
  {
      $handler = $function_name;

      $this->callback = $handler;
  }
  /**
   * Clear the callback stored by set_callback().
   *
   * @return void
   */
  function remove_callback()
  {
      $none = null;

      $this->callback = $none;
  }
  /**
   * Render the document via the root node's innertext() and optionally
   * write the result to disk.
   *
   * @param string $filepath Target file; nothing is written when it is ''.
   * @return mixed Whatever innertext() produced.
   */
  function save($filepath = '')
  {
      $markup = $this->root->innertext();

      if ($filepath === '') {
          return $markup;
      }

      // LOCK_EX: take an exclusive lock on the file while writing.
      file_put_contents($filepath, $markup, LOCK_EX);

      return $markup;
  }
  /**
   * Run a selector query against the root node.
   *
   * @param string $selector Selector forwarded to the root node's find().
   * @param int|null $idx Index forwarded to the root node's find().
   * @param bool $lowercase Flag forwarded to the root node's find().
   * @return mixed Result of the root node's find().
   */
  function find($selector, $idx = null, $lowercase = false)
  {
      $root_node = $this->root;

      return $root_node->find($selector, $idx, $lowercase);
  }
  /**
   * Release the nodes, the document text and the noise table held by this
   * object.
   *
   * @return void
   */
  function clear()
  {
      // Clear every node in the listed collections, if they exist.
      // 'children' is covered because of sourceforge ticket 2977248, a fix
      // for ongoing memory leaks that occur even with the use of clear.
      foreach (array('nodes', 'children') as $collection) {
          if (!isset($this->$collection)) {
              continue;
          }

          foreach ($this->$collection as $member) {
              $member->clear();
              $member = null;
          }
      }

      // Clear and drop the single-node references.
      foreach (array('parent', 'root') as $reference) {
          if (!isset($this->$reference)) {
              continue;
          }

          $this->$reference->clear();
          unset($this->$reference);
      }

      unset($this->doc);
      unset($this->noise);
  }
  /**
   * Dump the tree by delegating to the root node's dump().
   *
   * @param bool $show_attr Flag forwarded to the root node's dump().
   * @return void
   */
  function dump($show_attr = true)
  {
      $root_node = $this->root;

      $root_node->dump($show_attr);
  }
  /**
   * Reset the parser state and set up a fresh root node for a new document.
   *
   * @param string $str Document text; surrounding whitespace is trimmed.
   * @param bool $lowercase Stored in $this->lowercase.
   * @param string $defaultBRText Stored in $this->default_br_text.
   * @param string $defaultSpanText Stored in $this->default_span_text.
   * @return void
   */
  protected function prepare(
      $str, $lowercase = true,
      $defaultBRText = DEFAULT_BR_TEXT,
      $defaultSpanText = DEFAULT_SPAN_TEXT)
  {
      // Drop whatever an earlier load left behind.
      $this->clear();

      // Document text and its length.
      $document = trim($str);
      $length = strlen($document);
      $this->doc = $document;
      $this->size = $length;
      $this->original_size = $length; // original size of the html

      // Parser position and bookkeeping.
      $this->pos = 0;
      $this->cursor = 1;
      $this->noise = array();
      $this->nodes = array();

      // Options.
      $this->lowercase = $lowercase;
      $this->default_br_text = $defaultBRText;
      $this->default_span_text = $defaultSpanText;

      // Root node; it is also the first parent for parsed nodes.
      $root = new simple_html_dom_node($this);
      $this->root = $root;
      $root->tag = 'root';
      $root->_[HDOM_INFO_BEGIN] = -1;
      $root->nodetype = HDOM_TYPE_ROOT;
      $this->parent = $root;

      if ($length > 0) {
          $this->char = $this->doc[0];
      }
  }
  /**
   * Main parser loop: alternates between text up to the next '<' and tags
   * until read_tag() reports that nothing is left.
   *
   * @return bool Always true.
   */
  protected function parse()
  {
      for (;;) {
          $text = $this->copy_until_char('<');

          if ($text !== '') {
              // Text between tags becomes a node of its own.
              $text_node = new simple_html_dom_node($this);
              ++$this->cursor;
              $text_node->_[HDOM_INFO_TEXT] = $text;
              $this->link_nodes($text_node, false);
              continue;
          }

          // No text before the next tag: read the tag, stop when there is
          // none.
          if (!$this->read_tag()) {
              return true;
          }
      }
  }
  /**
   * Determine the charset of the loaded document and store it in
   * $this->_charset.
   *
   * Sources are tried in this order until one yields a value:
   *  1. the Content-Type header reported by
   *     get_last_retrieve_url_contents_content_type(), if that function exists
   *  2. <meta http-equiv="Content-Type" content="...">
   *  3. <meta charset="...">
   *  4. mb_detect_encoding(), if the mbstring extension is available
   *  5. 'UTF-8' as the final fallback
   * ISO-8859-1 / latin1 / latin-1 are reported as CP1252.
   *
   * @return string The detected charset.
   */
  protected function parse_charset()
  {
      global $debug_object;

      $charset = null;

      if (function_exists('get_last_retrieve_url_contents_content_type')) {
          $contentTypeHeader = get_last_retrieve_url_contents_content_type();
          // Media type parameter names are case-insensitive, so match
          // "Charset=" just like "charset=" (as done for the meta tag below).
          $success = preg_match('/charset=(.+)/i', $contentTypeHeader, $matches);
          if ($success) {
              $charset = $matches[1];
              if (is_object($debug_object)) {
                  $debug_object->debug_log(2,
                      'header content-type found charset of: '
                      . $charset
                  );
              }
          }
      }

      if (empty($charset)) {
          // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
          $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

          if (!empty($el)) {
              $fullvalue = $el->content;
              if (is_object($debug_object)) {
                  $debug_object->debug_log(2,
                      'meta content-type tag found'
                      . $fullvalue
                  );
              }

              if (!empty($fullvalue)) {
                  $success = preg_match(
                      '/charset=(.+)/i',
                      $fullvalue,
                      $matches
                  );

                  if ($success) {
                      $charset = $matches[1];
                  } else {
                      // If there is a meta tag, and they don't specify the
                      // character set, research says that it's typically
                      // ISO-8859-1
                      if (is_object($debug_object)) {
                          $debug_object->debug_log(2,
                              'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
                          );
                      }

                      $charset = 'ISO-8859-1';
                  }
              }
          }
      }

      if (empty($charset)) {
          // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
          if ($meta = $this->root->find('meta[charset]', 0)) {
              $charset = $meta->charset;
              if (is_object($debug_object)) {
                  $debug_object->debug_log(2, 'meta charset: ' . $charset);
              }
          }
      }

      if (empty($charset)) {
          // Try to guess the charset based on the content
          // Requires Multibyte String (mbstring) support (optional)
          if (function_exists('mb_detect_encoding')) {
              /**
               * mb_detect_encoding() is not intended to distinguish between
               * charsets, especially single-byte charsets. Its primary
               * purpose is to detect which multibyte encoding is in use,
               * i.e. UTF-8, UTF-16, shift-JIS, etc.
               *
               * -- https://bugs.php.net/bug.php?id=38138
               *
               * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
               * always result in CP1251/ISO-8859-5 and vice versa.
               *
               * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
               * to stay compatible.
               */
              $encoding = mb_detect_encoding(
                  $this->doc,
                  array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
              );

              if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
                  // Due to a limitation of mb_detect_encoding
                  // 'CP1251'/'ISO-8859-5' will be detected as
                  // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                  // which case we can simply assume it is the other charset.
                  // iconv() signals failure with false; an empty (but valid)
                  // conversion result must not be mistaken for a failure.
                  if (@iconv('CP1252', 'UTF-8', $this->doc) === false) {
                      $encoding = 'CP1251';
                  }
              }

              if ($encoding !== false) {
                  $charset = $encoding;
                  if (is_object($debug_object)) {
                      $debug_object->debug_log(2, 'mb_detect: ' . $charset);
                  }
              }
          }
      }

      if (empty($charset)) {
          // Assume it's UTF-8 as it is the most likely charset to be used
          $charset = 'UTF-8';
          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'No match found, assume ' . $charset);
          }
      }

      // Since CP1252 is a superset, if we get one of it's subsets, we want
      // it instead.
      if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true)) {
          // Log before overwriting, so the message names the charset that
          // is actually being replaced.
          if (is_object($debug_object)) {
              $debug_object->debug_log(2,
                  'replacing ' . $charset . ' with CP1252 as its a superset'
              );
          }
          $charset = 'CP1252';
      }

      if (is_object($debug_object)) {
          $debug_object->debug_log(1, 'EXIT - ' . $charset);
      }

      return $this->_charset = $charset;
  }
  /**
   * Read one tag starting at the current position and add the resulting
   * node(s) to the tree.
   *
   * Handles, in this order: end tags, "<!...>" constructs (doctype, cdata,
   * comments), malformed tags (stored as text) and regular start tags
   * including their attributes.
   *
   * @return bool false when the current character is not '<' (the end
   * position of the root node is recorded in that case), true otherwise.
   */
  protected function read_tag()
  {
      // Set end position if no further tags found
      if ($this->char !== '<') {
          $this->root->_[HDOM_INFO_END] = $this->cursor;
          return false;
      }

      // Remember where the tag starts; needed for tag_start and for turning
      // a broken tag back into text further down.
      $begin_tag_pos = $this->pos;
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

      // end tag
      if ($this->char === '/') {
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

          // Skip whitespace in end tags (i.e. in "</ html>")
          $this->skip($this->token_blank);
          $tag = $this->copy_until_char('>');

          // Skip attributes in end tags
          if (($pos = strpos($tag, ' ')) !== false) {
              $tag = substr($tag, 0, $pos);
          }

          $parent_lower = strtolower($this->parent->tag);
          $tag_lower = strtolower($tag);

          // The end tag is supposed to close the parent tag. Handle situations
          // when it doesn't
          if ($parent_lower !== $tag_lower) {
              // Parent tag does not have to be closed necessarily (optional closing tag)
              // Current tag is a block tag, so it may close an ancestor
              if (isset($this->optional_closing_tags[$parent_lower])
                  && isset($this->block_tags[$tag_lower])) {

                  $this->parent->_[HDOM_INFO_END] = 0;
                  $org_parent = $this->parent;

                  // Traverse ancestors to find a matching opening tag
                  // Stop at root node
                  while (($this->parent->parent)
                      && strtolower($this->parent->tag) !== $tag_lower
                  ){
                      $this->parent = $this->parent->parent;
                  }

                  // If we don't have a match add current tag as text node
                  if (strtolower($this->parent->tag) !== $tag_lower) {
                      $this->parent = $org_parent; // restore original parent

                      if ($this->parent->parent) {
                          $this->parent = $this->parent->parent;
                      }

                      $this->parent->_[HDOM_INFO_END] = $this->cursor;
                      return $this->as_text_node($tag);
                  }
              } elseif (($this->parent->parent)
                  && isset($this->block_tags[$tag_lower])
              ) {
                  // Grandparent exists and current tag is a block tag, so our
                  // parent doesn't have an end tag
                  $this->parent->_[HDOM_INFO_END] = 0; // No end tag
                  $org_parent = $this->parent;

                  // Traverse ancestors to find a matching opening tag
                  // Stop at root node
                  while (($this->parent->parent)
                      && strtolower($this->parent->tag) !== $tag_lower
                  ) {
                      $this->parent = $this->parent->parent;
                  }

                  // If we don't have a match add current tag as text node
                  if (strtolower($this->parent->tag) !== $tag_lower) {
                      $this->parent = $org_parent; // restore original parent
                      $this->parent->_[HDOM_INFO_END] = $this->cursor;
                      return $this->as_text_node($tag);
                  }
              } elseif (($this->parent->parent)
                  && strtolower($this->parent->parent->tag) === $tag_lower
              ) { // Grandparent exists and current tag closes it
                  $this->parent->_[HDOM_INFO_END] = 0;
                  $this->parent = $this->parent->parent;
              } else { // Random tag, add as text node
                  return $this->as_text_node($tag);
              }
          }

          // Set end position of parent tag to current cursor position
          $this->parent->_[HDOM_INFO_END] = $this->cursor;

          // The closed element is done; continue one level up (the root
          // node has no parent and therefore stays current).
          if ($this->parent->parent) {
              $this->parent = $this->parent->parent;
          }

          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // start tag
      $node = new simple_html_dom_node($this);
      $node->_[HDOM_INFO_BEGIN] = $this->cursor;
      ++$this->cursor;
      $tag = $this->copy_until($this->token_slash); // Get tag name
      $node->tag_start = $begin_tag_pos;

      // doctype, cdata & comments...
      // <!DOCTYPE html>
      // <![CDATA[ ... ]]>
      // <!-- Comment -->
      if (isset($tag[0]) && $tag[0] === '!') {
          $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

          if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
              $node->nodetype = HDOM_TYPE_COMMENT;
              $node->tag = 'comment';
          } else { // Could be doctype or CDATA but we don't care
              $node->nodetype = HDOM_TYPE_UNKNOWN;
              $node->tag = 'unknown';
          }

          if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }

          $this->link_nodes($node, true);
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // The start tag cannot contain another start tag, if so add as text
      // i.e. "<<html>"
      // NOTE: `!==` binds tighter than `=`, so $pos receives the boolean
      // result of the comparison; it is not used afterwards.
      if ($pos = strpos($tag, '<') !== false) {
          $tag = '<' . substr($tag, 0, -1);
          $node->_[HDOM_INFO_TEXT] = $tag;
          $this->link_nodes($node, false);
          $this->char = $this->doc[--$this->pos]; // prev
          return true;
      }

      // Handle invalid tag names (i.e. "<html#doc>")
      if (!preg_match('/^\w[\w:-]*$/', $tag)) {
          $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

          // Next char is the beginning of a new tag, don't touch it.
          if ($this->char === '<') {
              $this->link_nodes($node, false);
              return true;
          }

          // Next char closes current tag, add and be done with it.
          if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
          $this->link_nodes($node, false);
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
          return true;
      }

      // begin tag, add new node
      $node->nodetype = HDOM_TYPE_ELEMENT;
      $tag_lower = strtolower($tag);
      $node->tag = ($this->lowercase) ? $tag_lower : $tag;

      // handle optional closing tags
      if (isset($this->optional_closing_tags[$tag_lower])) {
          // Traverse ancestors to close all optional closing tags
          while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
              $this->parent->_[HDOM_INFO_END] = 0;
              $this->parent = $this->parent->parent;
          }
          $node->parent = $this->parent;
      }

      $guard = 0; // prevent infinity loop

      // [0] Space between tag and first attribute
      $space = array($this->copy_skip($this->token_blank), '', '');

      // attributes
      do {
          // Everything until the first equal sign should be the attribute name
          $name = $this->copy_until($this->token_equal);

          if ($name === '' && $this->char !== null && $space[0] === '') {
              break;
          }

          // The position did not move since the previous iteration: step
          // over the current character to guarantee progress.
          if ($guard === $this->pos) { // Escape infinite loop
              $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
              continue;
          }

          $guard = $this->pos;

          // handle endless '<'
          // Out of bounds before the tag ended
          if ($this->pos >= $this->size - 1 && $this->char !== '>') {
              $node->nodetype = HDOM_TYPE_TEXT;
              $node->_[HDOM_INFO_END] = 0;
              $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
              $node->tag = 'text';
              $this->link_nodes($node, false);
              return true;
          }

          // handle mismatch '<'
          // Attributes cannot start after opening tag
          if ($this->doc[$this->pos - 1] == '<') {
              $node->nodetype = HDOM_TYPE_TEXT;
              $node->tag = 'text';
              $node->attr = array();
              $node->_[HDOM_INFO_END] = 0;
              $node->_[HDOM_INFO_TEXT] = substr(
                  $this->doc,
                  $begin_tag_pos,
                  $this->pos - $begin_tag_pos - 1
              );
              // Step back two and forward one: parsing resumes at the '<'
              // in front of the current position.
              $this->pos -= 2;
              $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
              $this->link_nodes($node, false);
              return true;
          }

          if ($name !== '/' && $name !== '') { // this is a attribute name
              // [1] Whitespace after attribute name
              $space[1] = $this->copy_skip($this->token_blank);

              $name = $this->restore_noise($name); // might be a noisy name

              if ($this->lowercase) { $name = strtolower($name); }

              if ($this->char === '=') { // attribute with value
                  $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                  $this->parse_attr($node, $name, $space); // get attribute value
              } else {
                  //no value attr: nowrap, checked selected...
                  $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                  $node->attr[$name] = true;
                  if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
              }

              $node->_[HDOM_INFO_SPACE][] = $space;

              // prepare for next attribute
              $space = array(
                  $this->copy_skip($this->token_blank),
                  '',
                  ''
              );
          } else { // no more attributes
              break;
          }
      } while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended

      $this->link_nodes($node, true);
      $node->_[HDOM_INFO_ENDSPACE] = $space[0];

      // handle empty tags (i.e. "<div/>")
      if ($this->copy_until_char('>') === '/') {
          $node->_[HDOM_INFO_ENDSPACE] .= '/';
          $node->_[HDOM_INFO_END] = 0;
      } else {
          // reset parent
          // Self-closing tags never get children, so they must not become
          // the current parent.
          if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
              $this->parent = $node;
          }
      }

      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

      // If it's a BR tag, we need to set it's text to the default text.
      // This way when we see it in plaintext, we can generate formatting that the user wants.
      // since a br tag never has sub nodes, this works well.
      if ($node->tag === 'br') {
          $node->_[HDOM_INFO_INNER] = $this->default_br_text;
      }

      return true;
  }
  /**
   * Read an attribute value at the current position and store it on $node.
   *
   * The value may be double quoted, single quoted or unquoted. When the
   * node already has an attribute called $name, the value is consumed but
   * the first occurrence is kept.
   *
   * @param object $node Node that receives the attribute.
   * @param string $name Attribute name.
   * @param array $space Whitespace record of the attribute; index 2 is set
   * to the whitespace between "=" and the value.
   * @return void
   */
  protected function parse_attr($node, $name, &$space)
  {
      $is_duplicate = isset($node->attr[$name]);

      if (!$is_duplicate) {
          // Copy whitespace between "=" and value
          $space[2] = $this->copy_skip($this->token_blank);
      }

      switch ($this->char) {
          case '"':
              $quote_type = HDOM_QUOTE_DOUBLE;
              $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
              $value = $this->copy_until_char('"');
              $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
              break;
          case '\'':
              $quote_type = HDOM_QUOTE_SINGLE;
              $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
              $value = $this->copy_until_char('\'');
              $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
              break;
          default:
              $quote_type = HDOM_QUOTE_NO;
              $value = $this->copy_until($this->token_attr);
      }

      $value = $this->restore_noise($value);

      // PaperG: Attributes should not have \r or \n in them, that counts as
      // html whitespace.
      // giterlizzi: Fix for DokuWiki Bootstrap Template
      // NOTE(review): strip_rn is not declared or assigned in the part of
      // the class reviewed here - confirm where it is set. !empty() keeps
      // the outcome identical either way while avoiding an "undefined
      // property" warning when it is missing.
      if (!empty($this->strip_rn)) {
          $value = str_replace("\r", '', $value);
          $value = str_replace("\n", '', $value);
      }

      // PaperG: If this is a "class" selector, lets get rid of the preceeding
      // and trailing space since some people leave it in the multi class case.
      if ($name === 'class') {
          $value = trim($value);
      }

      if (!$is_duplicate) {
          $node->_[HDOM_INFO_QUOTE][] = $quote_type;
          $node->attr[$name] = $value;
      }
  }
  /**
   * Attach a node to the current parent node.
   *
   * @param object $node Node to attach.
   * @param bool $is_child When true the node is also added to the parent's
   * $children list, otherwise only to its $nodes list.
   * @return void
   */
  protected function link_nodes(&$node, $is_child)
  {
      $node->parent = $this->parent;
      $this->parent->nodes[] = $node;

      if (!$is_child) {
          return;
      }

      $this->parent->children[] = $node;
  }
  /**
   * Store an end tag that closes nothing as literal text ("</tag>") and
   * move on to the next character.
   *
   * @param string $tag Tag name of the end tag.
   * @return bool Always true.
   */
  protected function as_text_node($tag)
  {
      $text_node = new simple_html_dom_node($this);
      ++$this->cursor;
      $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
      $this->link_nodes($text_node, false);

      // Advance by one character; null marks the end of the document.
      ++$this->pos;
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return true;
  }
  /**
   * Move the position past any run of the given characters.
   *
   * @param string $chars Characters to skip.
   * @return void
   */
  protected function skip($chars)
  {
      $run_length = strspn($this->doc, $chars, $this->pos);
      $this->pos += $run_length;

      // null marks the end of the document.
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }
  }
  /**
   * Move the position past any run of the given characters and return the
   * characters that were skipped.
   *
   * @param string $chars Characters to skip.
   * @return string The skipped run, '' when nothing was skipped.
   */
  protected function copy_skip($chars)
  {
      $start = $this->pos;
      $run_length = strspn($this->doc, $chars, $start);

      $this->pos += $run_length;

      // null marks the end of the document.
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return ($run_length === 0) ? '' : substr($this->doc, $start, $run_length);
  }
  /**
   * Move the position up to the first occurrence of any of the given
   * characters and return the text that was passed over.
   *
   * @param string $chars Characters that stop the scan.
   * @return mixed Result of substr() for the passed-over span.
   */
  protected function copy_until($chars)
  {
      $start = $this->pos;
      $span_length = strcspn($this->doc, $chars, $start);

      $this->pos += $span_length;

      // null marks the end of the document.
      if ($this->pos < $this->size) {
          $this->char = $this->doc[$this->pos];
      } else {
          $this->char = null;
      }

      return substr($this->doc, $start, $span_length);
  }
  /**
   * Move the position to the next occurrence of $char and return the text
   * in front of it.
   *
   * When $char does not occur any more, the rest of the document is
   * returned and the parser is put into its end state (char = null).
   *
   * @param string $char Character to stop at.
   * @return mixed '' when already at the end or directly at $char,
   * otherwise the result of substr() for the passed-over span.
   */
  protected function copy_until_char($char)
  {
      if ($this->char === null) {
          return '';
      }

      $start = $this->pos;
      $found = strpos($this->doc, $char, $start);

      if ($found === false) {
          $rest = substr($this->doc, $start, $this->size - $start);
          $this->char = null;
          $this->pos = $this->size;
          return $rest;
      }

      // Already at the requested character: nothing to copy, nothing moves.
      if ($found === $start) {
          return '';
      }

      $this->char = $this->doc[$found];
      $this->pos = $found;

      return substr($this->doc, $start, $found - $start);
  }
  /**
   * Replace every match of $pattern in the document with a placeholder key
   * and remember the replaced text in $this->noise under that key.
   *
   * Matches are processed from last to first so the offsets of matches that
   * are still pending stay valid while the document is modified.
   *
   * @param string $pattern Regular expression handed to preg_match_all().
   * @param bool $remove_tag true replaces the entire match, false only the
   * first capture group.
   * @return void
   */
  protected function remove_noise($pattern, $remove_tag = false)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $count = preg_match_all(
          $pattern,
          $this->doc,
          $matches,
          PREG_SET_ORDER | PREG_OFFSET_CAPTURE
      );

      // 0 = entire match, 1 = submatch
      $group = $remove_tag ? 0 : 1;

      for ($i = $count - 1; $i >= 0; $i--) {
          $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

          if (is_object($debug_object)) {
              $debug_object->debug_log(2, 'key is: ' . $key);
          }

          // With PREG_OFFSET_CAPTURE each group is array(text, offset).
          $noise_text = $matches[$i][$group][0];
          $noise_offset = $matches[$i][$group][1];

          $this->noise[$key] = $noise_text;
          $this->doc = substr_replace($this->doc, $key, $noise_offset, strlen($noise_text));
      }

      // reset the length of content
      $this->size = strlen($this->doc);

      if ($this->size > 0) {
          $this->char = $this->doc[0];
      }
  }
  /**
   * Replace noise placeholders in $text with the content stored for them.
   *
   * A placeholder is '___noise___' followed by five characters. Three cases
   * are handled:
   *  - key found in $this->noise: the placeholder is replaced by the stored
   *    text, which is scanned again so placeholders nested inside it are
   *    restored as well;
   *  - key not found: the placeholder is replaced by
   *    'UNDEFINED NOISE FOR KEY: ' followed by the key;
   *  - fewer than five characters follow the marker: the marker is replaced
   *    by 'NO NUMERIC NOISE KEY'.
   *
   * @param string $text Text that may contain noise placeholders.
   * @return string Text with placeholders substituted.
   */
  function restore_noise($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $marker = '___noise___';

      // Position from which the next search starts. It only moves forward
      // past text that must not be examined again (see the undefined-key
      // branch), which guarantees the loop terminates for unknown keys.
      $offset = 0;

      while (($pos = strpos($text, $marker, $offset)) !== false) {
          // Sometimes there is a broken piece of markup, and we don't GET the
          // pos+11 etc... token which indicates a problem outside of us...
          if (strlen($text) > $pos + 15) {
              $key = $marker . substr($text, $pos + 11, 5);

              if (is_object($debug_object)) {
                  $debug_object->debug_log(2, 'located key of: ' . $key);
              }

              if (isset($this->noise[$key])) {
                  $text = substr($text, 0, $pos)
                      . $this->noise[$key]
                      . substr($text, $pos + 16);

                  // Rescan from the same spot: the restored text may itself
                  // contain placeholders.
                  $offset = $pos;
              } else {
                  // The diagnostic text contains the key, and therefore the
                  // marker. Searching it again would find the same unknown
                  // key forever, so continue strictly after the inserted
                  // text.
                  $replacement = 'UNDEFINED NOISE FOR KEY: ' . $key;
                  $text = substr($text, 0, $pos)
                      . $replacement
                      . substr($text, $pos + 16);
                  $offset = $pos + strlen($replacement);
              }
          } else {
              // There is no valid key being given back to us... We must get
              // rid of the ___noise___ or we will have a problem.
              $text = substr($text, 0, $pos)
                  . 'NO NUMERIC NOISE KEY'
                  . substr($text, $pos + 11);
              $offset = $pos;
          }
      }

      return $text;
  }
  /**
   * Find the first stored noise entry that contains $text.
   *
   * @param string $text Substring to look for.
   * @return string|null The matching noise entry, or null when none matches.
   */
  function search_noise($text)
  {
      global $debug_object;
      if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

      $hit = null;

      foreach ($this->noise as $candidate) {
          if (strpos($candidate, $text) === false) {
              continue;
          }

          $hit = $candidate;
          break;
      }

      return $hit;
  }
  /**
   * String conversion: yields the root node's innertext().
   *
   * @return string
   */
  function __toString()
  {
      $root = $this->root;

      return $root->innertext();
  }
  /**
   * Magic read access for a fixed set of virtual properties.
   *
   * 'outertext' and 'innertext' both return the root's innertext(),
   * 'plaintext' returns the root's text(), 'charset' and 'target_charset'
   * return the corresponding private fields. Any other name yields null.
   *
   * @param string $name Property being read.
   * @return mixed
   */
  function __get($name)
  {
      if ($name === 'outertext' || $name === 'innertext') {
          return $this->root->innertext();
      }

      if ($name === 'plaintext') {
          return $this->root->text();
      }

      if ($name === 'charset') {
          return $this->_charset;
      }

      if ($name === 'target_charset') {
          return $this->_target_charset;
      }

      return null;
  }
  /**
   * Delegate to the root node's childNodes().
   *
   * @param int $idx Index forwarded unchanged (defaults to -1).
   * @return mixed Whatever the root node's childNodes() returns.
   */
  function childNodes($idx = -1)
  {
      $root = $this->root;

      return $root->childNodes($idx);
  }
  /**
   * Delegate to the root node's first_child().
   *
   * @return mixed Whatever the root node's first_child() returns.
   */
  function firstChild()
  {
      $root = $this->root;

      return $root->first_child();
  }
  /**
   * Delegate to the root node's last_child().
   *
   * @return mixed Whatever the root node's last_child() returns.
   */
  function lastChild()
  {
      $root = $this->root;

      return $root->last_child();
  }
  /**
   * Build an element by parsing "<name>value</name>" with str_get_html()
   * and returning the first child of the parsed document.
   *
   * Errors raised while parsing or fetching the child are suppressed
   * with @, as before.
   *
   * @param string      $name  Tag name.
   * @param string|null $value Content placed between the tags.
   * @return mixed Result of firstChild() on the parsed document.
   */
  function createElement($name, $value = null)
  {
      $markup = '<' . $name . '>' . $value . '</' . $name . '>';
      $dom = @str_get_html($markup);

      return @$dom->firstChild();
  }
  /**
   * Parse $value with str_get_html() and return the last entry of the
   * resulting document's node list.
   *
   * Errors are suppressed with @, as before.
   *
   * @param string $value Text to parse.
   * @return mixed Last element of the parsed document's nodes array.
   */
  function createTextNode($value)
  {
      $nodes = @str_get_html($value)->nodes;

      return @end($nodes);
  }
  /**
   * Look up the first element matching the selector "#<id>".
   *
   * @param string $id Element id.
   * @return mixed Result of find() with index 0.
   */
  function getElementById($id)
  {
      $selector = '#' . $id;

      return $this->find($selector, 0);
  }
  /**
   * Look up elements matching the selector "#<id>".
   *
   * @param string   $id  Element id.
   * @param int|null $idx Index forwarded unchanged to find().
   * @return mixed Result of find().
   */
  function getElementsById($id, $idx = null)
  {
      $selector = '#' . $id;

      return $this->find($selector, $idx);
  }
  /**
   * Look up the first element matching the given tag name.
   *
   * @param string $name Tag name used as the selector.
   * @return mixed Result of find() with index 0.
   */
  function getElementByTagName($name)
  {
      $first = $this->find($name, 0);

      return $first;
  }
  /**
   * Look up elements matching the given tag name.
   *
   * @param string $name Tag name used as the selector.
   * @param int    $idx  Index forwarded unchanged to find() (defaults to -1).
   * @return mixed Result of find().
   */
  function getElementsByTagName($name, $idx = -1)
  {
      $found = $this->find($name, $idx);

      return $found;
  }
  /**
   * camelCase alias for load_file().
   *
   * Every argument given to this method is forwarded positionally to
   * load_file(). Previously the whole argument list was passed as a single
   * array argument, so load_file() never received the individual values.
   *
   * NOTE(review): load_file() is defined elsewhere in this class; this
   * assumes it accepts the same positional arguments — confirm.
   *
   * @return mixed Whatever load_file() returns.
   */
  function loadFile()
  {
      $args = func_get_args();

      return call_user_func_array(array($this, 'load_file'), $args);
  }
  1998. }