HTMLからタイトルとbodyのテキストを抽出

FullTextの登録用に、HTMLからタイトルとbodyのテキストを抜き出す。XML_HTMLSax でパースして前述のテキストを抜き出すクラス HtmlIndexExtractor を作成した。

<?php
require_once('XML/XML_HTMLSax.php');

/**
 * Contract for objects that pull indexable text out of a raw document.
 */
interface IndexExtractor {
  /**
   * Process the given document.
   *
   * @param mixed $target The raw document to extract from.
   */
  public function extract($target);
}

/**
 * Extracts the text of the <title> element and of the <body> element from
 * an HTML document.
 *
 * The document is tokenized by XML_HTMLSax. This object registers itself as
 * the handler object and accumulates character data while a <title> or
 * <body> element is open. After extract() has run, the results are available
 * through getTitle() and getContent().
 */
class HtmlIndexExtractor implements IndexExtractor {
  /** XML_HTMLSax instance that drives the handler callbacks below. */
  private $parser;

  /** Text collected for <title>, or null when none was captured. */
  private $title;
  /** Text collected for <body>, or null when none was captured. */
  private $content;
  /** Accumulator for the character data of the element being captured. */
  private $str;
  /** Lower-cased name of the element being captured ('title' or 'body'), or null. */
  private $capturing;

  /**
   * Create the parser and register this object's handler methods on it.
   *
   * Uses __construct rather than a method named after the class: same-name
   * constructors are not treated as constructors by PHP 8, and "=& new" is
   * a parse error from PHP 7.0 on.
   */
  public function __construct() {
    $this->parser = new XML_HTMLSax();

    $this->parser->set_object($this);
    $this->parser->set_option('XML_OPTION_TRIM_DATA_NODES');

    $this->parser->set_element_handler('openHandler', 'closeHandler');
    $this->parser->set_data_handler('dataHandler');
  }

  /**
   * Parse an HTML document and collect its title and body text.
   *
   * The input is handed to the parser as-is; no character-encoding
   * conversion is performed here.
   *
   * @param string $target HTML source to parse.
   */
  public function extract($target) {
    // Start from a clean slate so that results of a previously parsed
    // document never leak into this one when the instance is reused.
    $this->title = null;
    $this->content = null;
    $this->str = '';
    $this->capturing = null;

    $this->parser->parse($target);

    // HTML permits omitting </body>; in that case closeHandler() never
    // fires for it, so keep whatever was collected up to the end of input.
    if ($this->capturing === 'body') {
      $this->content = $this->str;
    }
    $this->capturing = null;
  }

  /**
   * @return string|null Title text from the last extract() call, or null
   *                     if no title was captured.
   */
  public function getTitle() {
    return $this->title;
  }

  /**
   * @return string|null Body text from the last extract() call, or null
   *                     if no body was captured.
   */
  public function getContent() {
    return $this->content;
  }

  /**
   * Opening-tag callback: begin a fresh capture on <title> or <body>.
   *
   * @param mixed  $parser Parser reference passed by the caller (unused).
   * @param string $name   Tag name; compared case-insensitively.
   * @param array  $attrs  Tag attributes (unused).
   */
  public function openHandler(&$parser, $name, $attrs) {
    $tagname = strtolower($name);
    if ($tagname === 'title' || $tagname === 'body') {
      $this->str = '';
      $this->capturing = $tagname;
    }
  }

  /**
   * Closing-tag callback: store the accumulated text when </title> or
   * </body> is reached and stop capturing.
   *
   * @param mixed  $parser Parser reference passed by the caller (unused).
   * @param string $name   Tag name; compared case-insensitively.
   */
  public function closeHandler(&$parser, $name) {
    $tagname = strtolower($name);
    if ($tagname === 'title') {
      $this->title = $this->str;
      $this->capturing = null;
    } elseif ($tagname === 'body') {
      $this->content = $this->str;
      $this->capturing = null;
    }
  }

  /**
   * Character-data callback: append the data to the accumulator while a
   * capture is active, decoding the special HTML entities.
   *
   * @param mixed  $parser Parser reference passed by the caller (unused).
   * @param string $data   Text found between tags.
   */
  public function dataHandler(&$parser, $data) {
    if ($this->capturing !== null) {
      $this->str .= htmlspecialchars_decode($data);
    }
  }

  /** No-op; not registered with the parser by this class. */
  public function escapeHandler(&$parser, $data) {}

  /** No-op; not registered with the parser by this class. */
  public function piHandler(&$parser, $target, $data) {}

  /** No-op; not registered with the parser by this class. */
  public function jaspHandler(&$parser, $data) {}
}

?>