[ Index ] |
PHP Cross Reference of DokuWiki |
[Summary view] [Print] [Text view]
<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;

/**
 * Class that encapsulates operations on the indexer database.
 *
 * The index is stored as a set of line oriented "*.idx" files below
 * $conf['indexdir']. Methods that modify the index acquire a directory
 * based lock (see lock()/unlock()) for the duration of the change.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer {
    /**
     * @var array $pidCache Cache for getPID(), maps page name => PID
     */
    protected $pidCache = array();

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return string|boolean  true on success, false on failure, "locked" if the lock could not be acquired
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text) {
        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        $pagewords = array();
        // get word usage in page
        $words = $this->getPageWords($text);
        if ($words === false) {
            $this->unlock();
            return false;
        }

        if (!empty($words)) {
            foreach (array_keys($words) as $wlen) {
                $index = $this->getIndex('i', $wlen);
                foreach ($words[$wlen] as $wid => $freq) {
                    $idx = ($wid<count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                    // remember "length*wordid" for the reverse (pageword) index
                    $pagewords[] = "$wlen*$wid";
                }
                if (!$this->saveIndex('i', $wlen, $index)) {
                    $this->unlock();
                    return false;
                }
            }
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':',$pageword_idx);
            // words that were on the page before but are not anymore
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a count of 0 removes the page's tuple from the line
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        $pageword_idx = join(':', $pagewords);
        if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return true;
    }

    /**
     * Split the words in a page and add them to the index.
     *
     * Unknown words are appended to the matching 'w' (word) index file.
     *
     * @param string    $text   content of the page
     * @return array|false      array(wordlen => array(word id => frequency)),
     *                          false if a word index could not be saved
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getPageWords($text) {

        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        $words = array();
        foreach ($tokens as $w=>$c) {
            $l = wordlen($w);
            if (isset($words[$l])){
                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            }else{
                $words[$l] = array($w => $c);
            }
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array();   //resulting index
        foreach (array_keys($words) as $wlen) {
            $word_idx = $this->getIndex('w', $wlen);
            $word_idx_modified = false;
            foreach ($words[$wlen] as $word => $freq) {
                // array keys that look like integers are converted by PHP, cast back for the strict search
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                if (!isset($index[$wlen]))
                    $index[$wlen] = array();
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx))
                return false;
        }

        return $index;
    }

    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean|string  true on success, false on failure, "locked" if the lock could not be acquired
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function addMetaKeys($page, $key, $value=null) {
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Special handling for titles so the index file is simpler
        if (isset($key['title'])) {
            $value = $key['title'];
            if (is_array($value)) {
                $value = $value[0];
            }
            $this->saveIndexKey('title', '', $pid, $value);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $addwords = false;

            if (!is_array($values)) $values = array($values);

            $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
            if ($val_idx !== '') {
                $val_idx = explode(':', $val_idx);
                // -1 means remove, 0 keep, 1 add
                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
            } else {
                $val_idx = array();
            }

            foreach ($values as $val) {
                $val = (string)$val;
                if ($val !== "") {
                    $id = array_search($val, $metawords, true);
                    if ($id === false) {
                        // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $metaidx[$id] = '';
                        $addwords = true;
                    }
                    // test if value is already in the index
                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0){
                        $val_idx[$id] = 0;
                    } else { // else add it
                        $val_idx[$id] = 1;
                    }
                }
            }

            if ($addwords) {
                $this->saveIndex($metaname.'_w', '', $metawords);
            }
            $vals_changed = false;
            foreach ($val_idx as $id => $action) {
                if ($action == -1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
                    $vals_changed = true;
                    unset($val_idx[$id]);
                } elseif ($action == 1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
                    $vals_changed = true;
                }
            }

            if ($vals_changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                $val_idx = implode(':', array_keys($val_idx));
                $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
            }

            unset($metaidx);
            unset($metawords);
        }

        $this->unlock();
        return true;
    }

    /**
     * Rename a page in the search index without changing the indexed content. This function doesn't check if the
     * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the
     * indexer and it deletes all previously indexed content of the new page.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool If the page was successfully renamed, can be a message in the case of an error
     */
    public function renamePage($oldpage, $newpage) {
        if (!$this->lock()) return 'locked';

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if ($this->deletePageNoLock($newpage) !== true) {
                // release the lock on this error path as well, otherwise the
                // indexer stays locked until the lock is considered stale
                $this->unlock();
                return false;
            }

            // keep the line so PIDs of other pages don't shift, but give it a name no page will have
            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
        }

        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages
     * will be updated.
     *
     * @param string $key       The metadata key of which a value shall be changed
     * @param string $oldvalue  The old value that shall be renamed
     * @param string $newvalue  The new value to which the old value shall be renamed, if exists values will be merged
     * @return bool|string      If renaming the value has been successful, false or error message on error.
     */
    public function renameMetaValue($key, $oldvalue, $newvalue) {
        if (!$this->lock()) return 'locked';

        // change the relation references index
        $metavalues = $this->getIndex($key.'_w', '');
        $oldid = array_search($oldvalue, $metavalues, true);
        if ($oldid !== false) {
            $newid = array_search($newvalue, $metavalues, true);
            if ($newid !== false) {
                // free memory
                unset ($metavalues);

                // okay, now we have two entries for the same value. we need to merge them.
                $indexline = $this->getIndexKey($key.'_i', '', $oldid);
                if ($indexline != '') {
                    $newindexline = $this->getIndexKey($key.'_i', '', $newid);
                    $pagekeys     = $this->getIndex($key.'_p', '');
                    $parts = explode(':', $indexline);
                    foreach ($parts as $part) {
                        list($id, $count) = explode('*', $part);
                        $newindexline =  $this->updateTuple($newindexline, $id, $count);

                        $keyline = explode(':', $pagekeys[$id]);
                        // remove old meta value
                        $keyline = array_diff($keyline, array($oldid));
                        // add new meta value when not already present
                        if (!in_array($newid, $keyline)) {
                            array_push($keyline, $newid);
                        }
                        $pagekeys[$id] = implode(':', $keyline);
                    }
                    $this->saveIndex($key.'_p', '', $pagekeys);
                    unset($pagekeys);
                    $this->saveIndexKey($key.'_i', '', $oldid, '');
                    $this->saveIndexKey($key.'_i', '', $newid, $newindexline);
                }
            } else {
                // the new value is unknown so far: simply rename the entry in place
                $metavalues[$oldid] = $newvalue;
                if (!$this->saveIndex($key.'_w', '', $metavalues)) {
                    $this->unlock();
                    return false;
                }
            }
        }

        $this->unlock();
        return true;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
     *
     * @param string    $page   a page name
     * @return string|boolean  true on success, false on failure, "locked" if the lock could not be acquired
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page) {
        if (!$this->lock())
            return "locked";

        $result = $this->deletePageNoLock($page);

        $this->unlock();

        return $result;
    }

    /**
     * Remove a page from the index without locking the index, only use this function if the index is already locked
     *
     * Erases entries in all known indexes.
     *
     * @param string    $page   a page name
     * @return boolean          the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function deletePageNoLock($page) {
        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            return false;
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $delwords = explode(':',$pageword_idx);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        if (!$this->saveIndexKey('pageword', '', $pid, "")) {
            return false;
        }

        $this->saveIndexKey('title', '', $pid, "");
        $keyidx = $this->getIndex('metadata', '');
        foreach ($keyidx as $metaname) {
            $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
            $meta_idx = $this->getIndex($metaname.'_i', '');
            foreach ($val_idx as $id) {
                if ($id === '') continue;
                $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
            }
            $this->saveIndex($metaname.'_i', '', $meta_idx);
            $this->saveIndexKey($metaname.'_p', '', $pid, '');
        }

        return true;
    }

    /**
     * Clear the whole index
     *
     * @return bool If the index has been cleared successfully
     */
    public function clear() {
        global $conf;

        if (!$this->lock()) return false;

        @unlink($conf['indexdir'].'/page.idx');
        @unlink($conf['indexdir'].'/title.idx');
        @unlink($conf['indexdir'].'/pageword.idx');
        @unlink($conf['indexdir'].'/metadata.idx');
        $dir = @opendir($conf['indexdir']);
        if($dir!==false){
            while(($f = readdir($dir)) !== false){
                if(substr($f,-4)=='.idx' &&
                    (substr($f,0,1)=='i' || substr($f,0,1)=='w'
                        || substr($f,-6)=='_w.idx' || substr($f,-6)=='_i.idx' || substr($f,-6)=='_p.idx'))
                    @unlink($conf['indexdir']."/$f");
            }
            // release the directory handle again
            closedir($dir);
        }
        @unlink($conf['indexdir'].'/lengths.idx');

        // clear the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

    /**
     * Split the text into words for fulltext search
     *
     * TODO: does this also need &$stopwords ?
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false) {
        // when wildcards are allowed the '*' is not added to the characters handed to stripspecials()
        $wc = ($wc) ? '' : '\*';
        $stopwords =& idx_get_stopwords();

        // prepare the text to be tokenized
        $evt = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($evt->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = \dokuwiki\Utf8\Asian::separateAsianWords($text);
            }
        }
        $evt->advise_after();
        unset($evt);

        $text = strtr($text,
                       array(
                           "\r" => ' ',
                           "\n" => ' ',
                           "\t" => ' ',
                           "\xC2\xAD" => '', //soft-hyphen
                       )
                     );
        if (preg_match('/[^0-9A-Za-z ]/u', $text))
            $text = \dokuwiki\Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);

        $wordlist = explode(' ', $text);
        foreach ($wordlist as $i => $word) {
            // only use the UTF-8 aware lowercasing when the word is not plain ASCII alphanumeric
            $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                \dokuwiki\Utf8\PhpString::strtolower($word) : strtolower($word);
        }

        foreach ($wordlist as $i => $word) {
            if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
              || array_search($word, $stopwords, true) !== false)
                unset($wordlist[$i]);
        }
        return array_values($wordlist);
    }

    /**
     * Get the numeric PID of a page
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    public function getPID($page) {
        // return PID without locking when it is in the cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];

        if (!$this->lock())
            return false;

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return $pid;
    }

    /**
     * Get the numeric PID of a page without locking the index.
     * Only use this function when the index is already locked.
     *
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    protected function getPIDNoLock($page) {
        // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
        if (isset($this->pidCache[$page])) return $this->pidCache[$page];
        $pid = $this->addIndexKey('page', '', $page);
        // never cache a failed lookup, the next call should try again
        if ($pid === false) return false;
        // limit cache to 10 entries by discarding the oldest element as in DokuWiki usually only the most recently
        // added item will be requested again
        if (count($this->pidCache) > 10) {
            // drop the first (oldest) entry by key; array_shift() would renumber
            // integer-like keys and thereby mix up purely numeric page names
            reset($this->pidCache);
            unset($this->pidCache[key($this->pidCache)]);
        }
        $this->pidCache[$page] = $pid;
        return $pid;
    }

    /**
     * Get the page id of a numeric PID
     *
     * @param int $pid The PID to get the page id for
     * @return string The page id
     */
    public function getPageFromPID($pid) {
        return $this->getIndexKey('page', '', $pid);
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
     *
     * @param array  $tokens list of words to search for
     * @return array         list of page names with usage counts
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens) {
        $result = array();
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) return array();
        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = array();
        foreach (array_keys($wids) as $wlen) {
            $wids[$wlen] = array_unique($wids[$wlen]);
            $index = $this->getIndex('i', $wlen);
            foreach($wids[$wlen] as $ixid) {
                if ($ixid < count($index))
                    $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
            }
        }
        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // handle the case when ($ixid < count($index)) has been false
                // and thus $docs[$wid] hasn't been set.
                if (!isset($docs[$wid])) continue;
                $hits = &$docs[$wid];
                foreach ($hits as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;
                    if (!isset($final[$word][$hitkey]))
                        $final[$word][$hitkey] = $hitcnt;
                    else
                        $final[$word][$hitkey] += $hitcnt;
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
     *
     * @param string    $key    name of the metadata key to look for
     * @param string    $value  search term to look for, must be a string or array of strings
     * @param callback  $func   comparison function
     * @return array            lists with page names, keys are query values if $value is array
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function lookupKey($key, &$value, $func=null) {
        if (!is_array($value))
            $value_array = array($value);
        else
            $value_array =& $value;

        // the matching ids for the provided value(s)
        $value_ids = array();

        $metaname = idx_cleanName($key);

        // get all words in order to search the matching ids
        if ($key == 'title') {
            $words = $this->getIndex('title', '');
        } else {
            $words = $this->getIndex($metaname.'_w', '');
        }

        if (!is_null($func)) {
            foreach ($value_array as $val) {
                foreach ($words as $i => $word) {
                    if (call_user_func_array($func, array($val, $word)))
                        $value_ids[$i][] = $val;
                }
            }
        } else {
            foreach ($value_array as $val) {
                $xval = $val;
                $caret = '^';
                $dollar = '$';
                // check for wildcards
                if (substr($xval, 0, 1) == '*') {
                    $xval = substr($xval, 1);
                    $caret = '';
                }
                if (substr($xval, -1, 1) == '*') {
                    $xval = substr($xval, 0, -1);
                    $dollar = '';
                }
                if (!$caret || !$dollar) {
                    // wildcard search: match the remaining term as an (un)anchored regular expression
                    $re = $caret.preg_quote($xval, '/').$dollar;
                    foreach(array_keys(preg_grep('/'.$re.'/', $words)) as $i)
                        $value_ids[$i][] = $val;
                } else {
                    if (($i = array_search($val, $words, true)) !== false)
                        $value_ids[$i][] = $val;
                }
            }
        }

        unset($words); // free the used memory

        // initialize the result so it won't be null
        $result = array();
        foreach ($value_array as $val) {
            $result[$val] = array();
        }

        $page_idx = $this->getIndex('page', '');

        // Special handling for titles
        if ($key == 'title') {
            // the title index is keyed by PID, so the matched line numbers are page ids
            foreach ($value_ids as $pid => $val_list) {
                $page = $page_idx[$pid];
                foreach ($val_list as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // load all lines and pages so the used lines can be taken and matched with the pages
            $lines = $this->getIndex($metaname.'_i', '');

            foreach ($value_ids as $value_id => $val_list) {
                // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
                // is an array with page_id => 1, page2_id => 1 etc. so take the keys only
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
                foreach ($val_list as $val) {
                    $result[$val] = array_merge($result[$val], $pages);
                }
            }
        }
        if (!is_array($value)) $result = $result[$value];
        return $result;
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param array  $words  The query terms.
     * @param array  $result Set to word => array("length*id" ...)
     * @return array         Set to length => array(id ...)
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexWords(&$words, &$result) {
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            $caret = '^';
            $dollar = '$';
            $xword = $word;
            $wlen = wordlen($word);

            // check for wildcards
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $caret = '';
                $wlen -= 1;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $dollar = '';
                $wlen -= 1;
            }
            if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword))
                continue;
            if (!isset($tokens[$xword]))
                $tokenlength[$wlen][] = $xword;
            if (!$caret || !$dollar) {
                $re = $caret.preg_quote($xword, '/').$dollar;
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword]))
                    $tokenwild[$xword] = $wlen;
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->indexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);
        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx, true);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        foreach ($tokens[$xword] as $w)
                            $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // $tokenwild is sorted by length, so all following base words are too long as well
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    if (is_null($w[1])) continue;
                    foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     * Warning: pages may not exist!
     *
     * @param string    $key    list only pages containing the metadata key (optional)
     * @return array            list of page names
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null) {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) return $page_idx;

        $metaname = idx_cleanName($key);

        // Special handling for titles
        if ($key == 'title') {
            $title_idx = $this->getIndex('title', '');
            // pages beyond the end of the title index have no title
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title)
                if ($title === "") unset($page_idx[$i]);
            return array_values($page_idx);
        }

        $pages = array();
        $lines = $this->getIndex($metaname.'_i', '');
        foreach ($lines as $line) {
            // array union instead of array_merge(): array_merge() renumbers
            // integer-like keys, which would replace purely numeric page names
            $pages += $this->parseTuples($page_idx, $line);
        }
        // integer-like array keys come back as int, page names are strings
        return array_map('strval', array_keys($pages));
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param int       $minlen minimum length of words to count
     * @param string    $key    metadata key to list. Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $minlen=3, $key=null) {
        if ($min < 1)
            $min = 1;
        if ($max < $min)
            $max = 0;

        $result = array();

        if ($key == 'title') {
            $index = $this->getIndex('title', '');
            $index = array_count_values($index);
            foreach ($index as $val => $cnt) {
                if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen)
                    $result[$val] = $cnt;
            }
        }
        elseif (!is_null($key)) {
            $metaname = idx_cleanName($key);
            $index = $this->getIndex($metaname.'_i', '');
            $val_idx = array();
            foreach ($index as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq >= $min && (!$max || $freq <= $max))
                    $val_idx[$wid] = $freq;
            }
            if (!empty($val_idx)) {
                $words = $this->getIndex($metaname.'_w', '');
                foreach ($val_idx as $wid => $freq) {
                    if (strlen($words[$wid]) >= $minlen)
                        $result[$words[$wid]] = $freq;
                }
            }
        }
        else {
            $lengths = idx_listIndexLengths();
            foreach ($lengths as $length) {
                if ($length < $minlen) continue;
                $index = $this->getIndex('i', $length);
                // the word list is only loaded when at least one word of this length qualifies
                $words = null;
                foreach ($index as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq >= $min && (!$max || $freq <= $max)) {
                        if ($words === null)
                            $words = $this->getIndex('w', $length);
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        }

        arsort($result);
        return $result;
    }

    /**
     * Lock the indexer.
     *
     * The lock is a directory below $conf['lockdir']. A lock directory older
     * than five minutes is treated as stale and removed.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool|string true or "stale lock removed" when the lock was acquired, false otherwise
     */
    protected function lock() {
        global $conf;
        $status = true;
        $run = 0;
        $lock = $conf['lockdir'].'/_indexer.lock';
        while (!@mkdir($lock)) {
            usleep(50);
            if(is_dir($lock) && time()-@filemtime($lock) > 60*5){
                // looks like a stale lock - remove it
                if (!@rmdir($lock)) {
                    // removing the stale lock failed
                    return false;
                } else {
                    $status = "stale lock removed";
                }
            }elseif($run++ == 1000){
                // give up, the lock could not be acquired within the retry limit
                return false;
            }
        }
        if ($conf['dperm']) {
            chmod($lock, $conf['dperm']);
        }
        return $status;
    }

    /**
     * Release the indexer lock.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool always true
     */
    protected function unlock() {
        global $conf;
        @rmdir($conf['lockdir'].'/_indexer.lock');
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * The $suffix argument is for an index that is split into
     * multiple parts. Different index files should use different
     * base names.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @return array            list of lines without CR or LF
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndex($idx, $suffix) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return array();
        return file($fn, FILE_IGNORE_NEW_LINES);
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The data is written to a ".tmp" file first which is then renamed
     * to the ".idx" file.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param array     $lines  list of lines without LF
     * @return bool             If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndex($idx, $suffix, &$lines) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        fwrite($fh, join("\n", $lines));
        if (!empty($lines))
            fwrite($fh, "\n");
        fclose($fh);
        if ($conf['fperm'])
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @return string           a line with trailing whitespace removed,
     *                          empty string if the file or the line does not exist
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexKey($idx, $suffix, $id) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';
        $ln = -1;
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) break;
        }
        fclose($fh);
        // $line is false when the file ended before line $id, the cast turns that into ''
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
     *
     * Missing lines before $id are filled with empty lines.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param int       $id     the line number
     * @param string    $line   line to write
     * @return bool             If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line) {
        global $conf;
        if (substr($line, -1) != "\n")
            $line .= "\n";
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        $ih = @fopen($fn.'.idx', 'r');
        if ($ih) {
            // copy the existing index, replacing line $id on the way
            $ln = -1;
            while (($curline = fgets($ih)) !== false) {
                fwrite($fh, (++$ln == $id) ? $line : $curline);
            }
            if ($id > $ln) {
                // the index was shorter than $id: pad with empty lines, then append
                while ($id > ++$ln)
                    fwrite($fh, "\n");
                fwrite($fh, $line);
            }
            fclose($ih);
        } else {
            // no index yet: pad with empty lines up to $id
            $ln = -1;
            while ($id > ++$ln)
                fwrite($fh, "\n");
            fwrite($fh, $line);
        }
        fclose($fh);
        if ($conf['fperm'])
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param string    $value  line to find in the index
     * @return int|bool          line number of the value in the index or false if writing the index failed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function addIndexKey($idx, $suffix, $value) {
        $index = $this->getIndex($idx, $suffix);
        $id = array_search($value, $index, true);
        if ($id === false) {
            $id = count($index);
            $index[$id] = $value;
            if (!$this->saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @return array
     */
    protected function listIndexLengths() {
        return idx_listIndexLengths();
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @param array|int $filter array keyed by the lengths to test for, or a minimum length
     * @return array
     */
    protected function indexLengths($filter) {
        global $conf;
        $idx = array();
        if (is_array($filter)) {
            // testing if index files exist only
            $path = $conf['indexdir']."/i";
            foreach ($filter as $key => $value) {
                if (file_exists($path.$key.'.idx'))
                    $idx[] = $key;
            }
        } else {
            $lengths = idx_listIndexLengths();
            foreach ($lengths as $key => $length) {
                // keep all the values equal or superior
                if ((int)$length >= (int)$filter)
                    $idx[] = $length;
            }
        }
        return $idx;
    }

    /**
     * Insert or replace a tuple in a line.
     *
     * A line is a ':' separated list of "id*count" tuples. A false-y count
     * removes the tuple of the given id, otherwise the new tuple is put at
     * the beginning of the line.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @param string|int $id
     * @param int    $count
     * @return string
     */
    protected function updateTuple($line, $id, $count) {
        if ($line != ''){
            // drop an existing tuple of this id
            $line = preg_replace('/(^|:)'.preg_quote($id,'/').'\*\d*/', '', $line);
        }
        $line = trim($line, ':');
        if ($count) {
            if ($line) {
                return "$id*$count:".$line;
            } else {
                return "$id*$count";
            }
        }
        return $line;
    }

    /**
     * Split a line into an array of tuples.
     *
     * Tuples without a count (or with a false-y count) are skipped.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param array $keys   used to translate the tuple ids, ids without an entry are kept as they are
     * @param string $line
     * @return array        (translated) id => count
     */
    protected function parseTuples(&$keys, $line) {
        $result = array();
        if ($line == '') return $result;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple === '') continue;
            $pair = explode('*', $tuple);
            // skip malformed tuples that have no count part
            if (!isset($pair[1])) continue;
            $key = $pair[0];
            $cnt = $pair[1];
            if (!$cnt) continue;
            if (isset($keys[$key])) {
                $key = $keys[$key];
                if ($key === false || is_null($key)) continue;
            }
            $result[$key] = $cnt;
        }
        return $result;
    }

    /**
     * Sum the counts in a list of tuples.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @return int
     */
    protected function countTuples($line) {
        $freq = 0;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple === '') continue;
            $pair = explode('*', $tuple);
            // malformed tuples without a count part don't add anything
            if (!isset($pair[1])) continue;
            $freq += (int)$pair[1];
        }
        return $freq;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body