MeCab + Senna + Tritonn で MySQL 全文検索を試す(2)

MeCab + Senna + Tritonn で MySQL 全文検索を試す - Tosshi Note の続き
Google 風の検索 UI を実装してみた。

テーブルを作成 - SQL

CREATE TABLE fastsearch (
 id INTEGER AUTO_INCREMENT,
 PRIMARY KEY (id),
 uri VARCHAR(512) NOT NULL,
 title VARCHAR(1024) NOT NULL,
 content MEDIUMTEXT,
 FULLTEXT INDEX USING NGRAM, SECTIONALIZE (title, content)
)
DEFAULT CHARSET utf8 ENGINE = MyISAM;

※USING NGRAM をはずせば、MeCab形態素解析になる。

データベース設定 - config.php

<?php
// ** MySQL settings ** //
define('DB_NAME', 'データベース名'); // The name of the database
define('DB_USER', 'ユーザー');      // Your MySQL username
define('DB_PASSWORD', 'パスワード');  // ...and password
define('DB_HOST', 'ホスト名');
define('TABLE_NAME', 'fastsearch');
?>

データベース操作 - db.php

PDO を使って DBを操作する処理を実装する。
find では kwic 関数を使って検索語のマークアップをする。

<?php
require_once("config.php");
  
class DBManager {
  var $table;
  var $dbh;
  var $words;
  var $cond;
  
  public function DBManager() {
    $this->table = TABLE_NAME;
    $this->dbh = new PDO("mysql:host=".DB_HOST.";dbname=".DB_NAME, DB_USER, DB_PASSWORD);
  }
 
  public function insertFullTextIndex($uri, $title, $content) {
    $stmt = $this->dbh->prepare("INSERT INTO ".$this->table
       ." (uri, title, content) VALUES (:uri, :title, :content)");
    $stmt->bindParam(':uri', $uri);
    $stmt->bindParam(':title', $title);
    $stmt->bindParam(':content', $content);
    
    return $stmt->execute();
  }
  
  public function setWords($sw) {
    $sw = mb_ereg_replace("(\s| )+"," ", $sw);
    $sw = mb_ereg_replace("^\s+", "", $sw);
    $sw = mb_ereg_replace("\s+$", "", $sw);
    $sw = mb_ereg_replace("'", "\'", $sw);
    
    $this->words = preg_split("/\s/", $sw);
    $this->cond = "";
    foreach ($this->words as $str){
      if ('-' != substr($str, 0, 1)) {
        $this->cond .= " +".$str;
      } else {
        $this->cond .= " ".$str;
      }
    }
  }
  
  public function hitcount(){
    $stmt = $this->dbh->prepare("SELECT COUNT(*) FROM ".$this->table
       ." WHERE MATCH(title, content) AGAINST ('"
       .$this->cond."' IN BOOLEAN MODE)");
    $stmt->execute();
    
    return $stmt->fetchColumn();
  }
  
  public function find($page=-1, $page_unit=50){
    $kcount = 0;
    $kwic = "kwic(content, 320, 1, 0, \"\", \"...\" ";
    foreach ($this->words as $str){
      if ('-' != substr($str, 0, 1)) {
        $kwic .= ", \"".$str."\", \"<em class='kw".$kcount."'>\", \"</em>\" "; 
        if (++$kcount == 5) $kcount = 0;
      }
    }
    $kwic .= ") ";
    
    if ($page == -1) {
      $pagesql = "";
    } else {
      $pagesql = " LIMIT ". ($page * $page_unit) .",".$page_unit;
    }
    $stmt = $this->dbh->prepare("SELECT uri, title, ".$kwic." as summary FROM "
       .$this->table." WHERE MATCH(title, content) AGAINST ('"
       .$this->cond."' IN BOOLEAN MODE)".$pagesql);
    return $stmt;
  }  
}
?>

検索UI - find.php

db.php を呼び出して Google 風の検索 UI を実現する。

<?php
mb_internal_encoding("UTF-8"); 
mb_regex_encoding("UTF-8");

header('Content-Type:text/html;charset=utf-8');
require("db.php");

$pagecount = 10;
?>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>Fast Search</title>
<link href="style.css" type="text/css" rel="stylesheet" media="screen" />
</head>
<body>
<img src="logo.png" width="517" height="90"/>
<div class="search-form">
<form id="mainForm" method="get" action="./index.php">
  <input type="text" name="find" value="<?php echo($_GET['find']) ?>" style="width:400px"/>
  <input type="submit" value="  検索  "/><br/>
</form>
</div>
<?php
// 検索

function microtime_float() {
    list($usec, $sec) = explode(" ", microtime());
    return ((float)$usec + (float)$sec);
}

$time_start = microtime_float();

if ($_GET['find']) {
  $starttime = microtime_float();
  $dbm = new DBManager();
  $dbm->setWords($_GET['find']);
  $hitcount = $dbm->hitcount();

  $result = "<b>".$_GET['find']."</b> で検索した結果 <b>".$hitcount."</b> 件";

  if ($hitcount == 0) {
?><div id="hitcount"><?php
  $endtime = microtime_float();
  echo $result;
  echo "<b>";
  printf("%.3f", ($endtime - $starttime));
?></b> 秒)</div>
<?php
} else {
  $startnum = 1;
  $currentpage = 0;
  if ($hitcount <= $pagecount) {
    $stmt = $dbm->find();
    $endnum = $hitcount;
  } elseif ($_GET['page']) {
    $startnum = 1 + $_GET['page'] * $pagecount;
    $endnum = $startnum + $pagecount;
    $currentpage = $_GET['page'];
    $stmt = $dbm->find($currentpage, $pagecount);
  } else {
    $endnum = $pagecount;
    $stmt = $dbm->find(0, $pagecount);
  }

  echo "<div id=\"hitcount\">";
  $endtime = microtime_float();
  echo $result;
  echo "中 <b>$startnum</b> - <b>$endnum</b> 件目 (<b>";
  printf("%.3f", ($endtime - $starttime));
  echo "</b> 秒)</div>";

  if ($stmt->execute()) {
    echo "<ol>";
    while ($record = $stmt->fetch(PDO::FETCH_ASSOC)) {
        echo "<li><a href=\"".$record['uri']."\">".$record['title']."</a>\n";
        echo "<div class=\"summary\">";
        echo $record['summary'];
        echo "</div>";
        echo "</li>";
    }
    echo "</ol>";
  } else {
    echo "fail";
  }
}

if ($hitcount > $pagecount) {
?>
<div class="paging-area">
<?php
  $pagen = 0;
  echo "検索結果ページ: <br/>";
  for ($total = 0; $total < $hitcount; $total += $pagecount) {
    if ($currentpage != $pagen) {
      echo "<a href=\"./find.php?find=";
      echo urlencode($_GET['find']);
      echo "&page=$pagen\">";
      echo (++$pagen);
      echo "</a>&nbsp; ";
    } else {
      echo "<b>".++$pagen."</b>&nbsp; ";
    }
    if ($pagen >= 15) {
      echo "...";
      break;
    }
  }
?>
</div>
<?php
}

}
?>
<div class="copyright">Copyright &copy; 2007 Toshimitsu Takahashi</div>
</body>
</html>

総括

以前の自前のMySQL全文検索時より素晴らしく速くなった。
平均 3KB の content を持つレコードを 2 万件登録したが 0.1 秒未満で常に結果が返ってくる。