[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Search/ -> Indexer.php (source)

   1  <?php
   2  
   3  namespace dokuwiki\Search;
   4  
   5  use dokuwiki\Extension\Event;
   6  
   7  /**
   8   * Class that encapsulates operations on the indexer database.
   9   *
  10   * @author Tom N Harris <tnharris@whoopdedo.org>
  11   */
  12  class Indexer {
  13      /**
  14       * @var array $pidCache Cache for getPID()
  15       */
  16      protected $pidCache = array();
  17  
  18      /**
  19       * Adds the contents of a page to the fulltext index
  20       *
  21       * The added text replaces previous words for the same page.
  22       * An empty value erases the page.
  23       *
  24       * @param string    $page   a page name
  25       * @param string    $text   the body of the page
  26       * @return string|boolean  the function completed successfully
  27       *
  28       * @author Tom N Harris <tnharris@whoopdedo.org>
  29       * @author Andreas Gohr <andi@splitbrain.org>
  30       */
  31      public function addPageWords($page, $text) {
  32          if (!$this->lock())
  33              return "locked";
  34  
  35          // load known documents
  36          $pid = $this->getPIDNoLock($page);
  37          if ($pid === false) {
  38              $this->unlock();
  39              return false;
  40          }
  41  
  42          $pagewords = array();
  43          // get word usage in page
  44          $words = $this->getPageWords($text);
  45          if ($words === false) {
  46              $this->unlock();
  47              return false;
  48          }
  49  
  50          if (!empty($words)) {
  51              foreach (array_keys($words) as $wlen) {
  52                  $index = $this->getIndex('i', $wlen);
  53                  foreach ($words[$wlen] as $wid => $freq) {
  54                      $idx = ($wid<count($index)) ? $index[$wid] : '';
  55                      $index[$wid] = $this->updateTuple($idx, $pid, $freq);
  56                      $pagewords[] = "$wlen*$wid";
  57                  }
  58                  if (!$this->saveIndex('i', $wlen, $index)) {
  59                      $this->unlock();
  60                      return false;
  61                  }
  62              }
  63          }
  64  
  65          // Remove obsolete index entries
  66          $pageword_idx = $this->getIndexKey('pageword', '', $pid);
  67          if ($pageword_idx !== '') {
  68              $oldwords = explode(':',$pageword_idx);
  69              $delwords = array_diff($oldwords, $pagewords);
  70              $upwords = array();
  71              foreach ($delwords as $word) {
  72                  if ($word != '') {
  73                      list($wlen, $wid) = explode('*', $word);
  74                      $wid = (int)$wid;
  75                      $upwords[$wlen][] = $wid;
  76                  }
  77              }
  78              foreach ($upwords as $wlen => $widx) {
  79                  $index = $this->getIndex('i', $wlen);
  80                  foreach ($widx as $wid) {
  81                      $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
  82                  }
  83                  $this->saveIndex('i', $wlen, $index);
  84              }
  85          }
  86          // Save the reverse index
  87          $pageword_idx = join(':', $pagewords);
  88          if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
  89              $this->unlock();
  90              return false;
  91          }
  92  
  93          $this->unlock();
  94          return true;
  95      }
  96  
  97      /**
  98       * Split the words in a page and add them to the index.
  99       *
 100       * @param string    $text   content of the page
 101       * @return array            list of word IDs and number of times used
 102       *
 103       * @author Andreas Gohr <andi@splitbrain.org>
 104       * @author Christopher Smith <chris@jalakai.co.uk>
 105       * @author Tom N Harris <tnharris@whoopdedo.org>
 106       */
 107      protected function getPageWords($text) {
 108  
 109          $tokens = $this->tokenizer($text);
 110          $tokens = array_count_values($tokens);  // count the frequency of each token
 111  
 112          $words = array();
 113          foreach ($tokens as $w=>$c) {
 114              $l = wordlen($w);
 115              if (isset($words[$l])){
 116                  $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
 117              }else{
 118                  $words[$l] = array($w => $c);
 119              }
 120          }
 121  
 122          // arrive here with $words = array(wordlen => array(word => frequency))
 123          $index = array();   //resulting index
 124          foreach (array_keys($words) as $wlen) {
 125              $word_idx = $this->getIndex('w', $wlen);
 126              $word_idx_modified = false;
 127              foreach ($words[$wlen] as $word => $freq) {
 128                  $word = (string)$word;
 129                  $wid = array_search($word, $word_idx, true);
 130                  if ($wid === false) {
 131                      $wid = count($word_idx);
 132                      $word_idx[] = $word;
 133                      $word_idx_modified = true;
 134                  }
 135                  if (!isset($index[$wlen]))
 136                      $index[$wlen] = array();
 137                  $index[$wlen][$wid] = $freq;
 138              }
 139              // save back the word index
 140              if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx))
 141                  return false;
 142          }
 143  
 144          return $index;
 145      }
 146  
 147      /**
 148       * Add/update keys to/of the metadata index.
 149       *
 150       * Adding new keys does not remove other keys for the page.
 151       * An empty value will erase the key.
 152       * The $key parameter can be an array to add multiple keys. $value will
 153       * not be used if $key is an array.
 154       *
 155       * @param string    $page   a page name
 156       * @param mixed     $key    a key string or array of key=>value pairs
 157       * @param mixed     $value  the value or list of values
 158       * @return boolean|string     the function completed successfully
 159       *
 160       * @author Tom N Harris <tnharris@whoopdedo.org>
 161       * @author Michael Hamann <michael@content-space.de>
 162       */
 163      public function addMetaKeys($page, $key, $value=null) {
 164          if (!is_array($key)) {
 165              $key = array($key => $value);
 166          } elseif (!is_null($value)) {
 167              // $key is array, but $value is not null
 168              trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
 169          }
 170  
 171          if (!$this->lock())
 172              return "locked";
 173  
 174          // load known documents
 175          $pid = $this->getPIDNoLock($page);
 176          if ($pid === false) {
 177              $this->unlock();
 178              return false;
 179          }
 180  
 181          // Special handling for titles so the index file is simpler
 182          if (isset($key['title'])) {
 183              $value = $key['title'];
 184              if (is_array($value)) {
 185                  $value = $value[0];
 186              }
 187              $this->saveIndexKey('title', '', $pid, $value);
 188              unset($key['title']);
 189          }
 190  
 191          foreach ($key as $name => $values) {
 192              $metaname = idx_cleanName($name);
 193              $this->addIndexKey('metadata', '', $metaname);
 194              $metaidx = $this->getIndex($metaname.'_i', '');
 195              $metawords = $this->getIndex($metaname.'_w', '');
 196              $addwords = false;
 197  
 198              if (!is_array($values)) $values = array($values);
 199  
 200              $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
 201              if ($val_idx !== '') {
 202                  $val_idx = explode(':', $val_idx);
 203                  // -1 means remove, 0 keep, 1 add
 204                  $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
 205              } else {
 206                  $val_idx = array();
 207              }
 208  
 209              foreach ($values as $val) {
 210                  $val = (string)$val;
 211                  if ($val !== "") {
 212                      $id = array_search($val, $metawords, true);
 213                      if ($id === false) {
 214                          // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
 215                          $id = count($metawords);
 216                          $metawords[$id] = $val;
 217                          $metaidx[$id] = '';
 218                          $addwords = true;
 219                      }
 220                      // test if value is already in the index
 221                      if (isset($val_idx[$id]) && $val_idx[$id] <= 0){
 222                          $val_idx[$id] = 0;
 223                      } else { // else add it
 224                          $val_idx[$id] = 1;
 225                      }
 226                  }
 227              }
 228  
 229              if ($addwords) {
 230                  $this->saveIndex($metaname.'_w', '', $metawords);
 231              }
 232              $vals_changed = false;
 233              foreach ($val_idx as $id => $action) {
 234                  if ($action == -1) {
 235                      $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
 236                      $vals_changed = true;
 237                      unset($val_idx[$id]);
 238                  } elseif ($action == 1) {
 239                      $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
 240                      $vals_changed = true;
 241                  }
 242              }
 243  
 244              if ($vals_changed) {
 245                  $this->saveIndex($metaname.'_i', '', $metaidx);
 246                  $val_idx = implode(':', array_keys($val_idx));
 247                  $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
 248              }
 249  
 250              unset($metaidx);
 251              unset($metawords);
 252          }
 253  
 254          $this->unlock();
 255          return true;
 256      }
 257  
 258      /**
 259       * Rename a page in the search index without changing the indexed content. This function doesn't check if the
 260       * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the
 261       * indexer and it deletes all previously indexed content of the new page.
 262       *
 263       * @param string $oldpage The old page name
 264       * @param string $newpage The new page name
 265       * @return string|bool If the page was successfully renamed, can be a message in the case of an error
 266       */
 267      public function renamePage($oldpage, $newpage) {
 268          if (!$this->lock()) return 'locked';
 269  
 270          $pages = $this->getPages();
 271  
 272          $id = array_search($oldpage, $pages, true);
 273          if ($id === false) {
 274              $this->unlock();
 275              return 'page is not in index';
 276          }
 277  
 278          $new_id = array_search($newpage, $pages, true);
 279          if ($new_id !== false) {
 280              // make sure the page is not in the index anymore
 281              if ($this->deletePageNoLock($newpage) !== true) {
 282                  return false;
 283              }
 284  
 285              $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
 286          }
 287  
 288          $pages[$id] = $newpage;
 289  
 290          // update index
 291          if (!$this->saveIndex('page', '', $pages)) {
 292              $this->unlock();
 293              return false;
 294          }
 295  
 296          // reset the pid cache
 297          $this->pidCache = array();
 298  
 299          $this->unlock();
 300          return true;
 301      }
 302  
 303      /**
 304       * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages
 305       * will be updated.
 306       *
 307       * @param string $key       The metadata key of which a value shall be changed
 308       * @param string $oldvalue  The old value that shall be renamed
 309       * @param string $newvalue  The new value to which the old value shall be renamed, if exists values will be merged
 310       * @return bool|string      If renaming the value has been successful, false or error message on error.
 311       */
 312      public function renameMetaValue($key, $oldvalue, $newvalue) {
 313          if (!$this->lock()) return 'locked';
 314  
 315          // change the relation references index
 316          $metavalues = $this->getIndex($key, '_w');
 317          $oldid = array_search($oldvalue, $metavalues, true);
 318          if ($oldid !== false) {
 319              $newid = array_search($newvalue, $metavalues, true);
 320              if ($newid !== false) {
 321                  // free memory
 322                  unset ($metavalues);
 323  
 324                  // okay, now we have two entries for the same value. we need to merge them.
 325                  $indexline = $this->getIndexKey($key.'_i', '', $oldid);
 326                  if ($indexline != '') {
 327                      $newindexline = $this->getIndexKey($key.'_i', '', $newid);
 328                      $pagekeys     = $this->getIndex($key.'_p', '');
 329                      $parts = explode(':', $indexline);
 330                      foreach ($parts as $part) {
 331                          list($id, $count) = explode('*', $part);
 332                          $newindexline =  $this->updateTuple($newindexline, $id, $count);
 333  
 334                          $keyline = explode(':', $pagekeys[$id]);
 335                          // remove old meta value
 336                          $keyline = array_diff($keyline, array($oldid));
 337                          // add new meta value when not already present
 338                          if (!in_array($newid, $keyline)) {
 339                              array_push($keyline, $newid);
 340                          }
 341                          $pagekeys[$id] = implode(':', $keyline);
 342                      }
 343                      $this->saveIndex($key.'_p', '', $pagekeys);
 344                      unset($pagekeys);
 345                      $this->saveIndexKey($key.'_i', '', $oldid, '');
 346                      $this->saveIndexKey($key.'_i', '', $newid, $newindexline);
 347                  }
 348              } else {
 349                  $metavalues[$oldid] = $newvalue;
 350                  if (!$this->saveIndex($key.'_w', '', $metavalues)) {
 351                      $this->unlock();
 352                      return false;
 353                  }
 354              }
 355          }
 356  
 357          $this->unlock();
 358          return true;
 359      }
 360  
 361      /**
 362       * Remove a page from the index
 363       *
 364       * Erases entries in all known indexes.
 365       *
 366       * @param string    $page   a page name
 367       * @return string|boolean  the function completed successfully
 368       *
 369       * @author Tom N Harris <tnharris@whoopdedo.org>
 370       */
 371      public function deletePage($page) {
 372          if (!$this->lock())
 373              return "locked";
 374  
 375          $result = $this->deletePageNoLock($page);
 376  
 377          $this->unlock();
 378  
 379          return $result;
 380      }
 381  
 382      /**
 383       * Remove a page from the index without locking the index, only use this function if the index is already locked
 384       *
 385       * Erases entries in all known indexes.
 386       *
 387       * @param string    $page   a page name
 388       * @return boolean          the function completed successfully
 389       *
 390       * @author Tom N Harris <tnharris@whoopdedo.org>
 391       */
 392      protected function deletePageNoLock($page) {
 393          // load known documents
 394          $pid = $this->getPIDNoLock($page);
 395          if ($pid === false) {
 396              return false;
 397          }
 398  
 399          // Remove obsolete index entries
 400          $pageword_idx = $this->getIndexKey('pageword', '', $pid);
 401          if ($pageword_idx !== '') {
 402              $delwords = explode(':',$pageword_idx);
 403              $upwords = array();
 404              foreach ($delwords as $word) {
 405                  if ($word != '') {
 406                      list($wlen,$wid) = explode('*', $word);
 407                      $wid = (int)$wid;
 408                      $upwords[$wlen][] = $wid;
 409                  }
 410              }
 411              foreach ($upwords as $wlen => $widx) {
 412                  $index = $this->getIndex('i', $wlen);
 413                  foreach ($widx as $wid) {
 414                      $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
 415                  }
 416                  $this->saveIndex('i', $wlen, $index);
 417              }
 418          }
 419          // Save the reverse index
 420          if (!$this->saveIndexKey('pageword', '', $pid, "")) {
 421              return false;
 422          }
 423  
 424          $this->saveIndexKey('title', '', $pid, "");
 425          $keyidx = $this->getIndex('metadata', '');
 426          foreach ($keyidx as $metaname) {
 427              $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
 428              $meta_idx = $this->getIndex($metaname.'_i', '');
 429              foreach ($val_idx as $id) {
 430                  if ($id === '') continue;
 431                  $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
 432              }
 433              $this->saveIndex($metaname.'_i', '', $meta_idx);
 434              $this->saveIndexKey($metaname.'_p', '', $pid, '');
 435          }
 436  
 437          return true;
 438      }
 439  
 440      /**
 441       * Clear the whole index
 442       *
 443       * @return bool If the index has been cleared successfully
 444       */
 445      public function clear() {
 446          global $conf;
 447  
 448          if (!$this->lock()) return false;
 449  
 450          @unlink($conf['indexdir'].'/page.idx');
 451          @unlink($conf['indexdir'].'/title.idx');
 452          @unlink($conf['indexdir'].'/pageword.idx');
 453          @unlink($conf['indexdir'].'/metadata.idx');
 454          $dir = @opendir($conf['indexdir']);
 455          if($dir!==false){
 456              while(($f = readdir($dir)) !== false){
 457                  if(substr($f,-4)=='.idx' &&
 458                      (substr($f,0,1)=='i' || substr($f,0,1)=='w'
 459                          || substr($f,-6)=='_w.idx' || substr($f,-6)=='_i.idx' || substr($f,-6)=='_p.idx'))
 460                      @unlink($conf['indexdir']."/$f");
 461              }
 462          }
 463          @unlink($conf['indexdir'].'/lengths.idx');
 464  
 465          // clear the pid cache
 466          $this->pidCache = array();
 467  
 468          $this->unlock();
 469          return true;
 470      }
 471  
 472      /**
 473       * Split the text into words for fulltext search
 474       *
 475       * TODO: does this also need &$stopwords ?
 476       *
 477       * @triggers INDEXER_TEXT_PREPARE
 478       * This event allows plugins to modify the text before it gets tokenized.
 479       * Plugins intercepting this event should also intercept INDEX_VERSION_GET
 480       *
 481       * @param string    $text   plain text
 482       * @param boolean   $wc     are wildcards allowed?
 483       * @return array            list of words in the text
 484       *
 485       * @author Tom N Harris <tnharris@whoopdedo.org>
 486       * @author Andreas Gohr <andi@splitbrain.org>
 487       */
 488      public function tokenizer($text, $wc=false) {
 489          $wc = ($wc) ? '' : '\*';
 490          $stopwords =& idx_get_stopwords();
 491  
 492          // prepare the text to be tokenized
 493          $evt = new Event('INDEXER_TEXT_PREPARE', $text);
 494          if ($evt->advise_before(true)) {
 495              if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
 496                  $text = \dokuwiki\Utf8\Asian::separateAsianWords($text);
 497              }
 498          }
 499          $evt->advise_after();
 500          unset($evt);
 501  
 502          $text = strtr($text,
 503                        array(
 504                            "\r" => ' ',
 505                            "\n" => ' ',
 506                            "\t" => ' ',
 507                            "\xC2\xAD" => '', //soft-hyphen
 508                        )
 509          );
 510          if (preg_match('/[^0-9A-Za-z ]/u', $text))
 511              $text = \dokuwiki\Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
 512  
 513          $wordlist = explode(' ', $text);
 514          foreach ($wordlist as $i => $word) {
 515              $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
 516                  \dokuwiki\Utf8\PhpString::strtolower($word) : strtolower($word);
 517          }
 518  
 519          foreach ($wordlist as $i => $word) {
 520              if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
 521                  || array_search($word, $stopwords, true) !== false)
 522                  unset($wordlist[$i]);
 523          }
 524          return array_values($wordlist);
 525      }
 526  
 527      /**
 528       * Get the numeric PID of a page
 529       *
 530       * @param string $page The page to get the PID for
 531       * @return bool|int The page id on success, false on error
 532       */
 533      public function getPID($page) {
 534          // return PID without locking when it is in the cache
 535          if (isset($this->pidCache[$page])) return $this->pidCache[$page];
 536  
 537          if (!$this->lock())
 538              return false;
 539  
 540          // load known documents
 541          $pid = $this->getPIDNoLock($page);
 542          if ($pid === false) {
 543              $this->unlock();
 544              return false;
 545          }
 546  
 547          $this->unlock();
 548          return $pid;
 549      }
 550  
 551      /**
 552       * Get the numeric PID of a page without locking the index.
 553       * Only use this function when the index is already locked.
 554       *
 555       * @param string $page The page to get the PID for
 556       * @return bool|int The page id on success, false on error
 557       */
 558      protected function getPIDNoLock($page) {
 559          // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
 560          if (isset($this->pidCache[$page])) return $this->pidCache[$page];
 561          $pid = $this->addIndexKey('page', '', $page);
 562          // limit cache to 10 entries by discarding the oldest element as in DokuWiki usually only the most recently
 563          // added item will be requested again
 564          if (count($this->pidCache) > 10) array_shift($this->pidCache);
 565          $this->pidCache[$page] = $pid;
 566          return $pid;
 567      }
 568  
 569      /**
 570       * Get the page id of a numeric PID
 571       *
 572       * @param int $pid The PID to get the page id for
 573       * @return string The page id
 574       */
 575      public function getPageFromPID($pid) {
 576          return $this->getIndexKey('page', '', $pid);
 577      }
 578  
 579      /**
 580       * Find pages in the fulltext index containing the words,
 581       *
 582       * The search words must be pre-tokenized, meaning only letters and
 583       * numbers with an optional wildcard
 584       *
 585       * The returned array will have the original tokens as key. The values
 586       * in the returned list is an array with the page names as keys and the
 587       * number of times that token appears on the page as value.
 588       *
 589       * @param array  $tokens list of words to search for
 590       * @return array         list of page names with usage counts
 591       *
 592       * @author Tom N Harris <tnharris@whoopdedo.org>
 593       * @author Andreas Gohr <andi@splitbrain.org>
 594       */
 595      public function lookup(&$tokens) {
 596          $result = array();
 597          $wids = $this->getIndexWords($tokens, $result);
 598          if (empty($wids)) return array();
 599          // load known words and documents
 600          $page_idx = $this->getIndex('page', '');
 601          $docs = array();
 602          foreach (array_keys($wids) as $wlen) {
 603              $wids[$wlen] = array_unique($wids[$wlen]);
 604              $index = $this->getIndex('i', $wlen);
 605              foreach($wids[$wlen] as $ixid) {
 606                  if ($ixid < count($index))
 607                      $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
 608              }
 609          }
 610          // merge found pages into final result array
 611          $final = array();
 612          foreach ($result as $word => $res) {
 613              $final[$word] = array();
 614              foreach ($res as $wid) {
 615                  // handle the case when ($ixid < count($index)) has been false
 616                  // and thus $docs[$wid] hasn't been set.
 617                  if (!isset($docs[$wid])) continue;
 618                  $hits = &$docs[$wid];
 619                  foreach ($hits as $hitkey => $hitcnt) {
 620                      // make sure the document still exists
 621                      if (!page_exists($hitkey, '', false)) continue;
 622                      if (!isset($final[$word][$hitkey]))
 623                          $final[$word][$hitkey] = $hitcnt;
 624                      else
 625                          $final[$word][$hitkey] += $hitcnt;
 626                  }
 627              }
 628          }
 629          return $final;
 630      }
 631  
 632      /**
 633       * Find pages containing a metadata key.
 634       *
 635       * The metadata values are compared as case-sensitive strings. Pass a
 636       * callback function that returns true or false to use a different
 637       * comparison function. The function will be called with the $value being
 638       * searched for as the first argument, and the word in the index as the
 639       * second argument. The function preg_match can be used directly if the
 640       * values are regexes.
 641       *
 642       * @param string    $key    name of the metadata key to look for
 643       * @param string    $value  search term to look for, must be a string or array of strings
 644       * @param callback  $func   comparison function
 645       * @return array            lists with page names, keys are query values if $value is array
 646       *
 647       * @author Tom N Harris <tnharris@whoopdedo.org>
 648       * @author Michael Hamann <michael@content-space.de>
 649       */
 650      public function lookupKey($key, &$value, $func=null) {
 651          if (!is_array($value))
 652              $value_array = array($value);
 653          else
 654              $value_array =& $value;
 655  
 656          // the matching ids for the provided value(s)
 657          $value_ids = array();
 658  
 659          $metaname = idx_cleanName($key);
 660  
 661          // get all words in order to search the matching ids
 662          if ($key == 'title') {
 663              $words = $this->getIndex('title', '');
 664          } else {
 665              $words = $this->getIndex($metaname.'_w', '');
 666          }
 667  
 668          if (!is_null($func)) {
 669              foreach ($value_array as $val) {
 670                  foreach ($words as $i => $word) {
 671                      if (call_user_func_array($func, array($val, $word)))
 672                          $value_ids[$i][] = $val;
 673                  }
 674              }
 675          } else {
 676              foreach ($value_array as $val) {
 677                  $xval = $val;
 678                  $caret = '^';
 679                  $dollar = '$';
 680                  // check for wildcards
 681                  if (substr($xval, 0, 1) == '*') {
 682                      $xval = substr($xval, 1);
 683                      $caret = '';
 684                  }
 685                  if (substr($xval, -1, 1) == '*') {
 686                      $xval = substr($xval, 0, -1);
 687                      $dollar = '';
 688                  }
 689                  if (!$caret || !$dollar) {
 690                      $re = $caret.preg_quote($xval, '/').$dollar;
 691                      foreach(array_keys(preg_grep('/'.$re.'/', $words)) as $i)
 692                          $value_ids[$i][] = $val;
 693                  } else {
 694                      if (($i = array_search($val, $words, true)) !== false)
 695                          $value_ids[$i][] = $val;
 696                  }
 697              }
 698          }
 699  
 700          unset($words); // free the used memory
 701  
 702          // initialize the result so it won't be null
 703          $result = array();
 704          foreach ($value_array as $val) {
 705              $result[$val] = array();
 706          }
 707  
 708          $page_idx = $this->getIndex('page', '');
 709  
 710          // Special handling for titles
 711          if ($key == 'title') {
 712              foreach ($value_ids as $pid => $val_list) {
 713                  $page = $page_idx[$pid];
 714                  foreach ($val_list as $val) {
 715                      $result[$val][] = $page;
 716                  }
 717              }
 718          } else {
 719              // load all lines and pages so the used lines can be taken and matched with the pages
 720              $lines = $this->getIndex($metaname.'_i', '');
 721  
 722              foreach ($value_ids as $value_id => $val_list) {
 723                  // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
 724                  // is an array with page_id => 1, page2_id => 1 etc. so take the keys only
 725                  $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
 726                  foreach ($val_list as $val) {
 727                      $result[$val] = array_merge($result[$val], $pages);
 728                  }
 729              }
 730          }
 731          if (!is_array($value)) $result = $result[$value];
 732          return $result;
 733      }
 734  
 735      /**
 736       * Find the index ID of each search term.
 737       *
 738       * The query terms should only contain valid characters, with a '*' at
 739       * either the beginning or end of the word (or both).
 740       * The $result parameter can be used to merge the index locations with
 741       * the appropriate query term.
 742       *
 743       * @param array  $words  The query terms.
 744       * @param array  $result Set to word => array("length*id" ...)
 745       * @return array         Set to length => array(id ...)
 746       *
 747       * @author Tom N Harris <tnharris@whoopdedo.org>
 748       */
 749      protected function getIndexWords(&$words, &$result) {
 750          $tokens = array();
 751          $tokenlength = array();
 752          $tokenwild = array();
 753          foreach ($words as $word) {
 754              $result[$word] = array();
 755              $caret = '^';
 756              $dollar = '$';
 757              $xword = $word;
 758              $wlen = wordlen($word);
 759  
 760              // check for wildcards
 761              if (substr($xword, 0, 1) == '*') {
 762                  $xword = substr($xword, 1);
 763                  $caret = '';
 764                  $wlen -= 1;
 765              }
 766              if (substr($xword, -1, 1) == '*') {
 767                  $xword = substr($xword, 0, -1);
 768                  $dollar = '';
 769                  $wlen -= 1;
 770              }
 771              if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword))
 772                  continue;
 773              if (!isset($tokens[$xword]))
 774                  $tokenlength[$wlen][] = $xword;
 775              if (!$caret || !$dollar) {
 776                  $re = $caret.preg_quote($xword, '/').$dollar;
 777                  $tokens[$xword][] = array($word, '/'.$re.'/');
 778                  if (!isset($tokenwild[$xword]))
 779                      $tokenwild[$xword] = $wlen;
 780              } else {
 781                  $tokens[$xword][] = array($word, null);
 782              }
 783          }
 784          asort($tokenwild);
 785          // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
 786          // $tokenlength = array( base word length => base word ... )
 787          // $tokenwild = array( base word => base word length ... )
 788          $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
 789          $indexes_known = $this->indexLengths($length_filter);
 790          if (!empty($tokenwild)) sort($indexes_known);
 791          // get word IDs
 792          $wids = array();
 793          foreach ($indexes_known as $ixlen) {
 794              $word_idx = $this->getIndex('w', $ixlen);
 795              // handle exact search
 796              if (isset($tokenlength[$ixlen])) {
 797                  foreach ($tokenlength[$ixlen] as $xword) {
 798                      $wid = array_search($xword, $word_idx, true);
 799                      if ($wid !== false) {
 800                          $wids[$ixlen][] = $wid;
 801                          foreach ($tokens[$xword] as $w)
 802                              $result[$w[0]][] = "$ixlen*$wid";
 803                      }
 804                  }
 805              }
 806              // handle wildcard search
 807              foreach ($tokenwild as $xword => $wlen) {
 808                  if ($wlen >= $ixlen) break;
 809                  foreach ($tokens[$xword] as $w) {
 810                      if (is_null($w[1])) continue;
 811                      foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) {
 812                          $wids[$ixlen][] = $wid;
 813                          $result[$w[0]][] = "$ixlen*$wid";
 814                      }
 815                  }
 816              }
 817          }
 818          return $wids;
 819      }
 820  
 821      /**
 822       * Return a list of all pages
 823       * Warning: pages may not exist!
 824       *
 825       * @param string    $key    list only pages containing the metadata key (optional)
 826       * @return array            list of page names
 827       *
 828       * @author Tom N Harris <tnharris@whoopdedo.org>
 829       */
 830      public function getPages($key=null) {
 831          $page_idx = $this->getIndex('page', '');
 832          if (is_null($key)) return $page_idx;
 833  
 834          $metaname = idx_cleanName($key);
 835  
 836          // Special handling for titles
 837          if ($key == 'title') {
 838              $title_idx = $this->getIndex('title', '');
 839              array_splice($page_idx, count($title_idx));
 840              foreach ($title_idx as $i => $title)
 841                  if ($title === "") unset($page_idx[$i]);
 842              return array_values($page_idx);
 843          }
 844  
 845          $pages = array();
 846          $lines = $this->getIndex($metaname.'_i', '');
 847          foreach ($lines as $line) {
 848              $pages = array_merge($pages, $this->parseTuples($page_idx, $line));
 849          }
 850          return array_keys($pages);
 851      }
 852  
 853      /**
 854       * Return a list of words sorted by number of times used
 855       *
 856       * @param int       $min    bottom frequency threshold
 857       * @param int       $max    upper frequency limit. No limit if $max<$min
 858       * @param int       $minlen minimum length of words to count
 859       * @param string    $key    metadata key to list. Uses the fulltext index if not given
 860       * @return array            list of words as the keys and frequency as values
 861       *
 862       * @author Tom N Harris <tnharris@whoopdedo.org>
 863       */
 864      public function histogram($min=1, $max=0, $minlen=3, $key=null) {
 865          if ($min < 1)
 866              $min = 1;
 867          if ($max < $min)
 868              $max = 0;
 869  
 870          $result = array();
 871  
 872          if ($key == 'title') {
 873              $index = $this->getIndex('title', '');
 874              $index = array_count_values($index);
 875              foreach ($index as $val => $cnt) {
 876                  if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen)
 877                      $result[$val] = $cnt;
 878              }
 879          }
 880          elseif (!is_null($key)) {
 881              $metaname = idx_cleanName($key);
 882              $index = $this->getIndex($metaname.'_i', '');
 883              $val_idx = array();
 884              foreach ($index as $wid => $line) {
 885                  $freq = $this->countTuples($line);
 886                  if ($freq >= $min && (!$max || $freq <= $max))
 887                      $val_idx[$wid] = $freq;
 888              }
 889              if (!empty($val_idx)) {
 890                  $words = $this->getIndex($metaname.'_w', '');
 891                  foreach ($val_idx as $wid => $freq) {
 892                      if (strlen($words[$wid]) >= $minlen)
 893                          $result[$words[$wid]] = $freq;
 894                  }
 895              }
 896          }
 897          else {
 898              $lengths = idx_listIndexLengths();
 899              foreach ($lengths as $length) {
 900                  if ($length < $minlen) continue;
 901                  $index = $this->getIndex('i', $length);
 902                  $words = null;
 903                  foreach ($index as $wid => $line) {
 904                      $freq = $this->countTuples($line);
 905                      if ($freq >= $min && (!$max || $freq <= $max)) {
 906                          if ($words === null)
 907                              $words = $this->getIndex('w', $length);
 908                          $result[$words[$wid]] = $freq;
 909                      }
 910                  }
 911              }
 912          }
 913  
 914          arsort($result);
 915          return $result;
 916      }
 917  
 918      /**
 919       * Lock the indexer.
 920       *
 921       * @author Tom N Harris <tnharris@whoopdedo.org>
 922       *
 923       * @return bool|string
 924       */
 925      protected function lock() {
 926          global $conf;
 927          $status = true;
 928          $run = 0;
 929          $lock = $conf['lockdir'].'/_indexer.lock';
 930          while (!@mkdir($lock)) {
 931              usleep(50);
 932              if(is_dir($lock) && time()-@filemtime($lock) > 60*5){
 933                  // looks like a stale lock - remove it
 934                  if (!@rmdir($lock)) {
 935                      $status = "removing the stale lock failed";
 936                      return false;
 937                  } else {
 938                      $status = "stale lock removed";
 939                  }
 940              }elseif($run++ == 1000){
 941                  // we waited 5 seconds for that lock
 942                  return false;
 943              }
 944          }
 945          if ($conf['dperm']) {
 946              chmod($lock, $conf['dperm']);
 947          }
 948          return $status;
 949      }
 950  
 951      /**
 952       * Release the indexer lock.
 953       *
 954       * @author Tom N Harris <tnharris@whoopdedo.org>
 955       *
 956       * @return bool
 957       */
 958      protected function unlock() {
 959          global $conf;
 960          @rmdir($conf['lockdir'].'/_indexer.lock');
 961          return true;
 962      }
 963  
 964      /**
 965       * Retrieve the entire index.
 966       *
 967       * The $suffix argument is for an index that is split into
 968       * multiple parts. Different index files should use different
 969       * base names.
 970       *
 971       * @param string    $idx    name of the index
 972       * @param string    $suffix subpart identifier
 973       * @return array            list of lines without CR or LF
 974       *
 975       * @author Tom N Harris <tnharris@whoopdedo.org>
 976       */
 977      protected function getIndex($idx, $suffix) {
 978          global $conf;
 979          $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
 980          if (!file_exists($fn)) return array();
 981          return file($fn, FILE_IGNORE_NEW_LINES);
 982      }
 983  
 984      /**
 985       * Replace the contents of the index with an array.
 986       *
 987       * @param string    $idx    name of the index
 988       * @param string    $suffix subpart identifier
 989       * @param array     $lines  list of lines without LF
 990       * @return bool             If saving succeeded
 991       *
 992       * @author Tom N Harris <tnharris@whoopdedo.org>
 993       */
 994      protected function saveIndex($idx, $suffix, &$lines) {
 995          global $conf;
 996          $fn = $conf['indexdir'].'/'.$idx.$suffix;
 997          $fh = @fopen($fn.'.tmp', 'w');
 998          if (!$fh) return false;
 999          fwrite($fh, join("\n", $lines));
1000          if (!empty($lines))
1001              fwrite($fh, "\n");
1002          fclose($fh);
1003          if ($conf['fperm'])
1004              chmod($fn.'.tmp', $conf['fperm']);
1005          io_rename($fn.'.tmp', $fn.'.idx');
1006          return true;
1007      }
1008  
1009      /**
1010       * Retrieve a line from the index.
1011       *
1012       * @param string    $idx    name of the index
1013       * @param string    $suffix subpart identifier
1014       * @param int       $id     the line number
1015       * @return string           a line with trailing whitespace removed
1016       *
1017       * @author Tom N Harris <tnharris@whoopdedo.org>
1018       */
1019      protected function getIndexKey($idx, $suffix, $id) {
1020          global $conf;
1021          $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
1022          if (!file_exists($fn)) return '';
1023          $fh = @fopen($fn, 'r');
1024          if (!$fh) return '';
1025          $ln = -1;
1026          while (($line = fgets($fh)) !== false) {
1027              if (++$ln == $id) break;
1028          }
1029          fclose($fh);
1030          return rtrim((string)$line);
1031      }
1032  
1033      /**
1034       * Write a line into the index.
1035       *
1036       * @param string    $idx    name of the index
1037       * @param string    $suffix subpart identifier
1038       * @param int       $id     the line number
1039       * @param string    $line   line to write
1040       * @return bool             If saving succeeded
1041       *
1042       * @author Tom N Harris <tnharris@whoopdedo.org>
1043       */
1044      protected function saveIndexKey($idx, $suffix, $id, $line) {
1045          global $conf;
1046          if (substr($line, -1) != "\n")
1047              $line .= "\n";
1048          $fn = $conf['indexdir'].'/'.$idx.$suffix;
1049          $fh = @fopen($fn.'.tmp', 'w');
1050          if (!$fh) return false;
1051          $ih = @fopen($fn.'.idx', 'r');
1052          if ($ih) {
1053              $ln = -1;
1054              while (($curline = fgets($ih)) !== false) {
1055                  fwrite($fh, (++$ln == $id) ? $line : $curline);
1056              }
1057              if ($id > $ln) {
1058                  while ($id > ++$ln)
1059                      fwrite($fh, "\n");
1060                  fwrite($fh, $line);
1061              }
1062              fclose($ih);
1063          } else {
1064              $ln = -1;
1065              while ($id > ++$ln)
1066                  fwrite($fh, "\n");
1067              fwrite($fh, $line);
1068          }
1069          fclose($fh);
1070          if ($conf['fperm'])
1071              chmod($fn.'.tmp', $conf['fperm']);
1072          io_rename($fn.'.tmp', $fn.'.idx');
1073          return true;
1074      }
1075  
1076      /**
1077       * Retrieve or insert a value in the index.
1078       *
1079       * @param string    $idx    name of the index
1080       * @param string    $suffix subpart identifier
1081       * @param string    $value  line to find in the index
1082       * @return int|bool          line number of the value in the index or false if writing the index failed
1083       *
1084       * @author Tom N Harris <tnharris@whoopdedo.org>
1085       */
1086      protected function addIndexKey($idx, $suffix, $value) {
1087          $index = $this->getIndex($idx, $suffix);
1088          $id = array_search($value, $index, true);
1089          if ($id === false) {
1090              $id = count($index);
1091              $index[$id] = $value;
1092              if (!$this->saveIndex($idx, $suffix, $index)) {
1093                  trigger_error("Failed to write $idx index", E_USER_ERROR);
1094                  return false;
1095              }
1096          }
1097          return $id;
1098      }
1099  
1100      /**
1101       * Get the list of lengths indexed in the wiki.
1102       *
1103       * Read the index directory or a cache file and returns
1104       * a sorted array of lengths of the words used in the wiki.
1105       *
1106       * @author YoBoY <yoboy.leguesh@gmail.com>
1107       *
1108       * @return array
1109       */
1110      protected function listIndexLengths() {
1111          return idx_listIndexLengths();
1112      }
1113  
1114      /**
1115       * Get the word lengths that have been indexed.
1116       *
1117       * Reads the index directory and returns an array of lengths
1118       * that there are indices for.
1119       *
1120       * @author YoBoY <yoboy.leguesh@gmail.com>
1121       *
1122       * @param array|int $filter
1123       * @return array
1124       */
1125      protected function indexLengths($filter) {
1126          global $conf;
1127          $idx = array();
1128          if (is_array($filter)) {
1129              // testing if index files exist only
1130              $path = $conf['indexdir']."/i";
1131              foreach ($filter as $key => $value) {
1132                  if (file_exists($path.$key.'.idx'))
1133                      $idx[] = $key;
1134              }
1135          } else {
1136              $lengths = idx_listIndexLengths();
1137              foreach ($lengths as $key => $length) {
1138                  // keep all the values equal or superior
1139                  if ((int)$length >= (int)$filter)
1140                      $idx[] = $length;
1141              }
1142          }
1143          return $idx;
1144      }
1145  
1146      /**
1147       * Insert or replace a tuple in a line.
1148       *
1149       * @author Tom N Harris <tnharris@whoopdedo.org>
1150       *
1151       * @param string $line
1152       * @param string|int $id
1153       * @param int    $count
1154       * @return string
1155       */
1156      protected function updateTuple($line, $id, $count) {
1157          if ($line != ''){
1158              $line = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $line);
1159          }
1160          $line = trim($line, ':');
1161          if ($count) {
1162              if ($line) {
1163                  return "$id*$count:".$line;
1164              } else {
1165                  return "$id*$count";
1166              }
1167          }
1168          return $line;
1169      }
1170  
1171      /**
1172       * Split a line into an array of tuples.
1173       *
1174       * @author Tom N Harris <tnharris@whoopdedo.org>
1175       * @author Andreas Gohr <andi@splitbrain.org>
1176       *
1177       * @param array $keys
1178       * @param string $line
1179       * @return array
1180       */
1181      protected function parseTuples(&$keys, $line) {
1182          $result = array();
1183          if ($line == '') return $result;
1184          $parts = explode(':', $line);
1185          foreach ($parts as $tuple) {
1186              if ($tuple === '') continue;
1187              list($key, $cnt) = explode('*', $tuple);
1188              if (!$cnt) continue;
1189              if (isset($keys[$key])) {
1190                  $key = $keys[$key];
1191                  if ($key === false || is_null($key)) continue;
1192              }
1193              $result[$key] = $cnt;
1194          }
1195          return $result;
1196      }
1197  
1198      /**
1199       * Sum the counts in a list of tuples.
1200       *
1201       * @author Tom N Harris <tnharris@whoopdedo.org>
1202       *
1203       * @param string $line
1204       * @return int
1205       */
1206      protected function countTuples($line) {
1207          $freq = 0;
1208          $parts = explode(':', $line);
1209          foreach ($parts as $tuple) {
1210              if ($tuple === '') continue;
1211              list(/* $pid */, $cnt) = explode('*', $tuple);
1212              $freq += (int)$cnt;
1213          }
1214          return $freq;
1215      }
1216  }