diff --git a/.travis.yml b/.travis.yml index 59a4a2e..42ae403 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,8 @@ language: php php: - - 5.3 - - 5.4 - - 5.5 - - 5.6 - - hhvm + - 7.2 + - 7.3 before_script: - composer self-update diff --git a/README.md b/README.md index 1ec1c62..041c160 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ HtmlParser =============== - +[](https://packagist.org/packages/bupt1987/html-parser) [](https://travis-ci.org/bupt1987/html-parser) php html解析工具,类似与PHP Simple HTML DOM Parser。 diff --git a/composer.json b/composer.json index 7dc101b..dc5e1a6 100644 --- a/composer.json +++ b/composer.json @@ -12,11 +12,10 @@ } ], "require": { - "php": ">=5.3.2", - "ext-dom": "*" + "php": ">=5.5" }, "require-dev": { - "phpunit/phpunit": "4.6.*" + "phpunit/phpunit": "^4.8" }, "autoload": { "psr-4": { diff --git a/src/ParserDom.php b/src/ParserDom.php index b14d79f..ac82e63 100644 --- a/src/ParserDom.php +++ b/src/ParserDom.php @@ -19,20 +19,20 @@ class ParserDom { /** * @var array */ - private $_lFind = array(); + private $_lFind = []; /** * @param \DOMNode|string $node * @throws \Exception */ - public function __construct($node = null) { - if ($node !== null) { + public function __construct($node = NULL) { + if ($node !== NULL) { if ($node instanceof \DOMNode) { $this->node = $node; } else { $dom = new \DOMDocument(); - $dom->preserveWhiteSpace = false; - $dom->strictErrorChecking = false; + $dom->preserveWhiteSpace = FALSE; + $dom->strictErrorChecking = FALSE; if (@$dom->loadHTML($node)) { $this->node = $dom; } else { @@ -43,75 +43,46 @@ public function __construct($node = null) { } /** - * @codeCoverageIgnore + * 初始化的时候可以不用传入html,后面可以多次使用 + * @param null $node + * @throws \Exception */ - public function __destruct() { - $this->clearNode($this->node); + public function load($node = NULL) { + if ($node instanceof \DOMNode) { + $this->node = $node; + } else { + $dom = new \DOMDocument(); + $dom->preserveWhiteSpace = FALSE; + $dom->strictErrorChecking = FALSE; + if (@$dom->loadHTML($node)) { + $this->node = $dom; + } else { + throw new \Exception('load html error'); + } + } } /** - * 广度优先查询 - * - * @param string $selector - * @param number $idx 找第几个,从0开始计算,null 表示都返回, 负数表示倒数第几个 - * @return ParserDom|ParserDom[] + * @codeCoverageIgnore + * @param string $name + * @return mixed */ - /*public function findBreadthFirst($selector, $idx = null) { - if (empty($this->node->childNodes)) { - return false; - } - $selectors = $this->parse_selector($selector); - if (($count = count($selectors)) === 0) { - return false; - } - $found = array(); - for ($c = 0; $c < $count; $c++) { - if (($level = count($selectors [$c])) === 0) { - return false; - } - $need_to_search = iterator_to_array($this->node->childNodes); - $search_level = 1; - while (!empty($need_to_search)) { - $temp = array(); - foreach ($need_to_search as $search) { - if ($search_level >= $level) { - $rs = $this->seek($search, $selectors [$c], $level - 1); - if ($rs !== false && $idx !== null) { - if ($idx == count($found)) { - return new self($rs); - } else { - $found[] = new self($rs); - } - } elseif ($rs !== false) { - $found[] = new self($rs); - } - } - $temp[] = $search; - array_shift($need_to_search); - } - foreach ($temp as $temp_val) { - if (!empty($temp_val->childNodes)) { - foreach ($temp_val->childNodes as $val) { - $need_to_search[] = $val; - } - } - } - $search_level++; - } - } - if ($idx !== null) { - if ($idx < 0) { - $idx = count($found) + $idx; - } - if (isset($found[$idx])) { - return $found[$idx]; - } else { - return false; - } + function __get($name) { + switch ($name) { + case 'outertext': + return $this->outerHtml(); + case 'innertext': + return $this->innerHtml(); + case 'plaintext': + return $this->getPlainText(); + case 'href': + return $this->getAttr("href"); + case 'src': + return $this->getAttr("src"); + default: + return NULL; } - return $found; - }*/ - + } /** * 深度优先查询 @@ -120,30 +91,30 @@ public function __destruct() { * @param number $idx 找第几个,从0开始计算,null 表示都返回, 负数表示倒数第几个 * @return self|self[] */ - public function find($selector, $idx = null) { + public function find($selector, $idx = NULL) { if (empty($this->node->childNodes)) { - return false; + return FALSE; } $selectors = $this->parse_selector($selector); if (($count = count($selectors)) === 0) { - return false; + return FALSE; } for ($c = 0; $c < $count; $c++) { if (($level = count($selectors [$c])) === 0) { - return false; + return FALSE; } $this->search($this->node, $idx, $selectors [$c], $level); } $found = $this->_lFind; - $this->_lFind = array(); - if ($idx !== null) { + $this->_lFind = []; + if ($idx !== NULL) { if ($idx < 0) { $idx = count($found) + $idx; } if (isset($found[$idx])) { return $found[$idx]; } else { - return false; + return FALSE; } } return $found; @@ -177,7 +148,7 @@ public function innerHtml() { */ public function outerHtml() { $doc = new \DOMDocument(); - $doc->appendChild($doc->importNode($this->node, true)); + $doc->appendChild($doc->importNode($this->node, TRUE)); return $doc->saveHTML($doc); } @@ -193,7 +164,7 @@ public function getAttr($name) { if (isset($oAttr)) { return $oAttr->nodeValue; } - return null; + return NULL; } /** @@ -222,7 +193,7 @@ private function match($exp, $pattern, $value) { } return preg_match("/" . $pattern . "/i", $value); } - return false; + return FALSE; } /** @@ -232,17 +203,17 @@ private function match($exp, $pattern, $value) { * @return array */ private function parse_selector($selector_string) { - $pattern = '/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is'; + $pattern = '/([\w\-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([\/, ]+)/is'; preg_match_all($pattern, trim($selector_string) . ' ', $matches, PREG_SET_ORDER); - $selectors = array(); - $result = array(); + $selectors = []; + $result = []; foreach ($matches as $m) { $m [0] = trim($m [0]); if ($m [0] === '' || $m [0] === '/' || $m [0] === '//') continue; if ($m [1] === 'tbody') continue; - list ($tag, $key, $val, $exp, $no_key) = array($m [1], null, null, '=', false); + list ($tag, $key, $val, $exp, $no_key) = [$m [1], NULL, NULL, '=', FALSE]; if (!empty ($m [2])) { $key = 'id'; $val = $m [2]; @@ -266,12 +237,12 @@ private function parse_selector($selector_string) { // elements that do NOT have the specified attribute if (isset ($key [0]) && $key [0] === '!') { $key = substr($key, 1); - $no_key = true; + $no_key = TRUE; } - $result [] = array($tag, $key, $val, $exp, $no_key); + $result [] = [$tag, $key, $val, $exp, $no_key]; if (trim($m [7]) === ',') { $selectors [] = $result; - $result = array(); + $result = []; } } if (count($result) > 0) { @@ -293,25 +264,25 @@ private function parse_selector($selector_string) { private function search(&$search, $idx, $selectors, $level, $search_level = 0) { if ($search_level >= $level) { $rs = $this->seek($search, $selectors, $level - 1); - if ($rs !== false && $idx !== null) { + if ($rs !== FALSE && $idx !== NULL) { if ($idx == count($this->_lFind)) { $this->_lFind[] = new self($rs); - return true; + return TRUE; } else { $this->_lFind[] = new self($rs); } - } elseif ($rs !== false) { + } elseif ($rs !== FALSE) { $this->_lFind[] = new self($rs); } } if (!empty($search->childNodes)) { foreach ($search->childNodes as $val) { if ($this->search($val, $idx, $selectors, $level, $search_level + 1)) { - return true; + return TRUE; } } } - return false; + return FALSE; } /** @@ -334,24 +305,24 @@ private function text(&$node) { */ private function seek($search, $selectors, $current) { if (!($search instanceof \DOMElement)) { - return false; + return FALSE; } list ($tag, $key, $val, $exp, $no_key) = $selectors [$current]; - $pass = true; + $pass = TRUE; if ($tag === '*' && !$key) { exit('tag为*时,key不能为空'); } if ($tag && $tag != $search->tagName && $tag !== '*') { - $pass = false; + $pass = FALSE; } if ($pass && $key) { if ($no_key) { if ($search->hasAttribute($key)) { - $pass = false; + $pass = FALSE; } } else { if ($key != "plaintext" && !$search->hasAttribute($key)) { - $pass = false; + $pass = FALSE; } } } @@ -373,7 +344,7 @@ private function seek($search, $selectors, $current) { } } if (!$check) { - $pass = false; + $pass = FALSE; } } if ($pass) { @@ -383,10 +354,10 @@ private function seek($search, $selectors, $current) { } elseif ($this->seek($this->getParent($search), $selectors, $current)) { return $search; } else { - return false; + return FALSE; } } else { - return false; + return FALSE; } } @@ -400,21 +371,4 @@ private function getParent($node) { return $node->parentNode; } - /** - * @codeCoverageIgnore - * 释放内存 - * - * @param $node - */ - private function clearNode(&$node) { - if (!empty($node->childNodes)) { - foreach ($node->childNodes as $child) { - $this->clearNode($child); - } - } - unset($node); - } - } - -?> diff --git a/test.php b/test.php index 72d0fba..77d16e5 100644 --- a/test.php +++ b/test.php @@ -1,41 +1,25 @@ -
- -p1
-p2
-p3
-