HTMLからタイトルとbodyのテキストを抽出
FullTextの登録用にHTMLからタイトルとbodyのテキストを抜き出す。XML_HTMLSax でパースして前述のテキストを抜き出すクラス HtmlIndexExtractor を作成した。
<?php
require_once('XML/XML_HTMLSax.php');

/**
 * Contract for classes that extract indexable text from a source document.
 */
interface IndexExtractor
{
    /**
     * Parse the given document and extract its indexable parts.
     *
     * @param string $target Raw document source.
     */
    public function extract($target);
}

/**
 * Extracts the text of the <title> and <body> elements from an HTML document
 * by driving an XML_HTMLSax parser with this object as its callback handler.
 *
 * Usage: call extract() with the HTML source, then read the results through
 * getTitle() and getContent().
 */
class HtmlIndexExtractor implements IndexExtractor
{
    /** @var XML_HTMLSax SAX parser that calls back into this object. */
    private $parser;

    /** @var string|null Text of the <title> element, or null if none was seen. */
    private $title;

    /** @var string|null Text of the <body> element, or null if none was seen. */
    private $content;

    /** @var string Buffer accumulating text for the element being captured. */
    private $str = '';

    /** @var string|null Lower-case name of the element being captured ('title' or 'body'), or null. */
    private $current;

    /**
     * Create the parser and register this object's handler methods with it.
     */
    public function __construct()
    {
        // Plain assignment: objects are handles, and "=& new" is a parse
        // error from PHP 7 onwards.
        $this->parser = new XML_HTMLSax();
        $this->parser->set_object($this);
        // NOTE(review): presumably trims whitespace around text nodes - confirm
        // against the XML_HTMLSax documentation.
        $this->parser->set_option('XML_OPTION_TRIM_DATA_NODES');
        $this->parser->set_element_handler('openHandler', 'closeHandler');
        $this->parser->set_data_handler('dataHandler');
    }

    /**
     * Parse an HTML document and collect its title and body text.
     *
     * Results of any previous call are discarded first, so one instance can
     * be reused for several documents.
     *
     * No character-set conversion is performed here; the text is extracted in
     * whatever encoding $target is supplied in.
     *
     * @param string $target HTML source to parse.
     */
    public function extract($target)
    {
        // Reset state so a previous document's results cannot leak through.
        $this->title = null;
        $this->content = null;
        $this->str = '';
        $this->current = null;

        $this->parser->parse($target);

        // The closing </body> (or </title>) tag may be missing; keep whatever
        // was collected for the element that was still open at end of input.
        $this->finishCapture();
    }

    /**
     * @return string|null Title text from the last extract() call, or null if
     *                     the document had no <title> element.
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * @return string|null Body text from the last extract() call, or null if
     *                     the document had no <body> element.
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Parser callback for an opening tag: starts capturing on <title> or <body>.
     *
     * @param object $parser Parser instance issuing the callback (unused).
     * @param string $name   Tag name as reported by the parser.
     * @param array  $attrs  Tag attributes (unused).
     */
    public function openHandler($parser, $name, $attrs)
    {
        $tagname = strtolower($name);
        if ($tagname !== 'title' && $tagname !== 'body') {
            return;
        }
        if ($this->current === 'body') {
            // A <title> (e.g. inside inline SVG) or stray <body> nested in the
            // body is ordinary content; it must not wipe the body buffer.
            return;
        }

        // Store an unterminated <title> before starting the next capture.
        $this->finishCapture();
        $this->current = $tagname;
    }

    /**
     * Parser callback for a closing tag: stores the captured text when the
     * element currently being captured is closed.
     *
     * @param object $parser Parser instance issuing the callback (unused).
     * @param string $name   Tag name as reported by the parser.
     */
    public function closeHandler($parser, $name)
    {
        if (strtolower($name) === $this->current) {
            $this->finishCapture();
        }
    }

    /**
     * Parser callback for character data: appends the entity-decoded text to
     * the buffer while a <title> or <body> capture is active.
     *
     * @param object $parser Parser instance issuing the callback (unused).
     * @param string $data   Text node content.
     */
    public function dataHandler($parser, $data)
    {
        if ($this->current !== null) {
            $this->str .= htmlspecialchars_decode($data);
        }
    }

    /** No-op callback for escape sections; not registered with the parser here. */
    public function escapeHandler($parser, $data)
    {
    }

    /** No-op callback for processing instructions; not registered with the parser here. */
    public function piHandler($parser, $target, $data)
    {
    }

    /** No-op callback for JSP/ASP blocks; not registered with the parser here. */
    public function jaspHandler($parser, $data)
    {
    }

    /**
     * Move the buffered text into the property matching the element being
     * captured, then leave capture mode. Does nothing but clear the buffer
     * when no capture is active.
     */
    private function finishCapture()
    {
        if ($this->current === 'title') {
            $this->title = $this->str;
        } elseif ($this->current === 'body') {
            $this->content = $this->str;
        }
        $this->current = null;
        $this->str = '';
    }
}
?>