QueryPath HTML 처리 예제 5 (XPath 목록 보기)

QueryPath HTML 처리 예제 5
XPath 목록 보기

1 예제 코드

<?php
require '/var/www/phplib/QueryPath3/qp.php';

/**
 * Prints a value in print_r() format, wrapped in an <xmp> element.
 *
 * @param mixed $arr Value to dump.
 */
function xmp_print($arr) {
	// echo with commas emits each argument in turn, so the opening tag is
	// written before print_r() runs.
	echo '<xmp>', print_r($arr, true), '</xmp>';
}

/**
 * Builds a list of XPath strings for $node in which no value occurs twice.
 *
 * Starts from the list returned by get_raw_xpath_list($node). While any path
 * occurs more than once, each occurrence of that path gets a 1-based
 * positional predicate ("[1]", "[2]", ...). Entries below a duplicated path
 * (those starting with "<path>/") receive the predicate of the most recently
 * numbered occurrence. This assumes every parent path is listed before its
 * descendants -- TODO confirm against get_raw_xpath_list().
 *
 * @param mixed $node Passed through unchanged to get_raw_xpath_list().
 * @return array XPath strings (original keys preserved) without duplicates.
 */
function get_xpath_list($node) {
	$xpath_list = get_raw_xpath_list($node);

	// Give every node its own unique XPath.
	$unique = false;
	while( !$unique ) {
		$xpath_cnt = array_count_values($xpath_list);
		foreach($xpath_cnt as $xpath => $cnt) {
			if($cnt < 2) continue;

			// Array keys that look like integers come back as int; normalise.
			$xpath = (string) $xpath;
			$len = strlen($xpath);
			$index = 0;
			foreach($xpath_list as $k => $v) {
				if($xpath === $v) {
					// Another occurrence of the duplicated path itself.
					$index++;
				}
				else if( strncmp($v, "$xpath/", $len + 1) !== 0 ) {
					// Neither the path nor one of its descendants.
					continue;
				}
				// Rewrite only the leading path. str_replace() would also
				// rewrite later occurrences of the same text, e.g. "div/div"
				// inside "div/div/div/div", and put a predicate on the wrong
				// segment.
				$xpath_list[$k] = "{$xpath}[{$index}]" . substr($v, $len);
			}
		}
		if( count($xpath_list) == count(array_unique($xpath_list)) ) $unique = true;
	}
	return $xpath_list;
}

/**
 * Wrapper around htmlqp() that pre-processes the markup before parsing.
 *
 * Carriage returns (chr(13)) are removed and every '&' is replaced with the
 * placeholder '__QP_AMPSAND__'. utf8_decode2() turns the placeholder back
 * into '&' when node content is read out again.
 *
 * @param mixed $document Markup to parse.
 * @param mixed $selector Selector handed to htmlqp().
 * @param array $options  Options handed to htmlqp().
 * @return mixed Whatever htmlqp() returns.
 */
function my_htmlqp($document = NULL, $selector = NULL, $options = array('convert_to_encoding'=>'UTF-8') ) {
	// Replacements are applied in array order: strip CR first, then mask '&'.
	$prepared = str_replace(
		array(chr(13), '&'),
		array('', '__QP_AMPSAND__'),
		$document
	);
	return htmlqp($prepared, $selector, $options);
}

/**
 * Summarises a node as a plain array.
 *
 * All values except the tag name are passed through utf8_decode2().
 *
 * @param object $node Object exposing tag(), attr(), innerhtml() and html().
 * @return array Keys 'tag', 'attr', 'innerhtml' and 'html', in that order.
 */
function node2arr($node) {
	return array(
		'tag'       => $node->tag(),
		'attr'      => array_map("utf8_decode2", $node->attr()),
		'innerhtml' => utf8_decode2($node->innerhtml()),
		'html'      => utf8_decode2($node->html()),
	);
}

/**
 * Post-processes text read back from a parsed node.
 *
 * Runs utf8_decode() on the input, restores '&' from the '__QP_AMPSAND__'
 * placeholder, then replaces numeric character references ("&#NNN;" or
 * "&#xHH;") using replace_num_entity().
 *
 * @param string $str Text to convert.
 * @return string Converted text.
 */
function utf8_decode2($str) {
	// The placeholder must be restored before the entity pass, otherwise
	// masked references would not match the pattern.
	$restored = str_replace('__QP_AMPSAND__', '&', utf8_decode($str));
	return preg_replace_callback('/&#([0-9a-fx]+);/mi', 'replace_num_entity', $restored);
}

/**
 * preg_replace_callback() handler for one numeric character reference.
 *
 * $ord[1] holds the reference body: "x" followed by hex digits is read as
 * hexadecimal, anything else goes through intval(). Code points below 128 are
 * returned as a single byte. Larger ones are encoded as a 2-4 byte UTF-8
 * sequence and then passed through utf8_decode(). Values of 1114112 and above
 * yield no return value (null).
 *
 * @param array $ord Match array; index 1 is the captured reference body.
 * @return string|null Replacement text, or null when the value is out of range.
 */
function replace_num_entity($ord) {
	$token = $ord[1];
	$code = preg_match('/^x([0-9a-f]+)$/i', $token, $hex)
		? hexdec($hex[1])
		: intval($token);

	if ($code < 128) return chr($code);
	if ($code >= 1114112) return;

	// Sequence length plus the mask/marker pair applied to the lead byte.
	if ($code < 2048) { $length = 2; $mask = 31; $marker = 192; }
	elseif ($code < 65536) { $length = 3; $mask = 15; $marker = 224; }
	else { $length = 4; $mask = 7; $marker = 240; }

	$utf8 = '';
	for ($pos = 0; $pos < $length; $pos++) {
		// Each byte carries six bits of the code point, most significant first.
		$shift = 6 * ($length - 1 - $pos);
		$bits = (($code >> $shift) & 63) | 128;
		if ($pos == 0) $bits = ($bits & $mask) | $marker;
		$utf8 .= chr($bits);
	}
	return utf8_decode($utf8);
}

// Sample document for the demo: several <div> elements share the same tag
// path, so the XPaths derived from them need disambiguating.
// NOTE(review): the markup is not well-formed (<b/>, no closing </body>) --
// presumably on purpose; confirm before "fixing" it.
$html = '<!DOCTYPE html>
<html>
<head>
	<title>예제</title>
</head>
<body> 
	<div class="test" id="top">
		<div>
			<div>
				<div>1</div>
				<div>2</div>
			</div>
			<div>
				<div>A</div>
				<div>B</div>
			</div>
		</div>
		<a href="http://jmnote.com" target="_blank" title="JM노트">JM노트</a>
		<div><b>A<b/>&nbsp;&gt;&lt;<br><br><b>B</b></div>
	</div>
</html>';

// Parse the sample markup, select <body>, and build its XPath list.
$body = my_htmlqp($html, 'body');
$xpath_list = get_xpath_list($body);

// Dump the complete XPath list.
echo '<h3>XPath 목록</h3>';
xmp_print($xpath_list);

// Look each XPath up again and dump what it resolves to.
foreach($xpath_list as $xpath) {
	$node = $body->xpath($xpath);
	echo '<h3>', $xpath, '</h3>';
	xmp_print( node2arr($node) );
}

2 같이 보기

문서 댓글 ({{ doc_comments.length }})
{{ comment.name }} {{ comment.created | snstime }}