[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/ -> indexer.php (source)

   1  <?php
   2  /**
   3   * Functions to create the fulltext search index
   4   *
   5   * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
   6   * @author     Andreas Gohr <andi@splitbrain.org>
   7   * @author     Tom N Harris <tnharris@whoopdedo.org>
   8   */
   9  
  10  use dokuwiki\Extension\Event;
  11  use dokuwiki\Search\Indexer;
  12  
  13  // Version tag used to force rebuild on upgrade
  14  define('INDEXER_VERSION', 8);
  15  
  16  // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
  17  if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);
  18  
  19  /**
  20   * Version of the indexer taking into consideration the external tokenizer.
  21   * The indexer is only compatible with data written by the same version.
  22   *
  23   * @triggers INDEXER_VERSION_GET
  24   * Plugins that modify what gets indexed should hook this event and
  25   * add their version info to the event data like so:
  26   *     $data[$plugin_name] = $plugin_version;
  27   *
  28   * @author Tom N Harris <tnharris@whoopdedo.org>
  29   * @author Michael Hamann <michael@content-space.de>
  30   *
  31   * @return int|string
  32   */
  33  function idx_get_version(){
  34      static $indexer_version = null;
  35      if ($indexer_version == null) {
  36          $version = INDEXER_VERSION;
  37  
  38          // DokuWiki version is included for the convenience of plugins
  39          $data = array('dokuwiki'=>$version);
  40          Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
  41          unset($data['dokuwiki']); // this needs to be first
  42          ksort($data);
  43          foreach ($data as $plugin=>$vers)
  44              $version .= '+'.$plugin.'='.$vers;
  45          $indexer_version = $version;
  46      }
  47      return $indexer_version;
  48  }
  49  
  50  /**
  51   * Measure the length of a string.
  52   * Differs from strlen in handling of asian characters.
  53   *
  54   * @author Tom N Harris <tnharris@whoopdedo.org>
  55   *
  56   * @param string $w
  57   * @return int
  58   */
  59  function wordlen($w){
  60      $l = strlen($w);
  61      // If left alone, all chinese "words" will get put into w3.idx
  62      // So the "length" of a "word" is faked
  63      if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
  64          foreach($leadbytes[0] as $b)
  65              $l += ord($b) - 0xE1;
  66      }
  67      return $l;
  68  }
  69  
  70  /**
  71   * Create an instance of the indexer.
  72   *
  73   * @return Indexer    an Indexer
  74   *
  75   * @author Tom N Harris <tnharris@whoopdedo.org>
  76   */
  77  function idx_get_indexer() {
  78      static $Indexer;
  79      if (!isset($Indexer)) {
  80          $Indexer = new Indexer();
  81      }
  82      return $Indexer;
  83  }
  84  
  85  /**
  86   * Returns words that will be ignored.
  87   *
  88   * @return array                list of stop words
  89   *
  90   * @author Tom N Harris <tnharris@whoopdedo.org>
  91   */
  92  function & idx_get_stopwords() {
  93      static $stopwords = null;
  94      if (is_null($stopwords)) {
  95          global $conf;
  96          $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
  97          if(file_exists($swfile)){
  98              $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
  99          }else{
 100              $stopwords = array();
 101          }
 102      }
 103      return $stopwords;
 104  }
 105  
 106  /**
 107   * Adds/updates the search index for the given page
 108   *
 109   * Locking is handled internally.
 110   *
 111   * @param string        $page   name of the page to index
 112   * @param boolean       $verbose    print status messages
 113   * @param boolean       $force  force reindexing even when the index is up to date
 114   * @return string|boolean  the function completed successfully
 115   *
 116   * @author Tom N Harris <tnharris@whoopdedo.org>
 117   */
 118  function idx_addPage($page, $verbose=false, $force=false) {
 119      $idxtag = metaFN($page,'.indexed');
 120      // check if page was deleted but is still in the index
 121      if (!page_exists($page)) {
 122          if (!file_exists($idxtag)) {
 123              if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
 124              return false;
 125          }
 126          $Indexer = idx_get_indexer();
 127          $result = $Indexer->deletePage($page);
 128          if ($result === "locked") {
 129              if ($verbose) print("Indexer: locked".DOKU_LF);
 130              return false;
 131          }
 132          @unlink($idxtag);
 133          return $result;
 134      }
 135  
 136      // check if indexing needed
 137      if(!$force && file_exists($idxtag)){
 138          if(trim(io_readFile($idxtag)) == idx_get_version()){
 139              $last = @filemtime($idxtag);
 140              if($last > @filemtime(wikiFN($page))){
 141                  if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
 142                  return false;
 143              }
 144          }
 145      }
 146  
 147      $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
 148      if ($indexenabled === false) {
 149          $result = false;
 150          if (file_exists($idxtag)) {
 151              $Indexer = idx_get_indexer();
 152              $result = $Indexer->deletePage($page);
 153              if ($result === "locked") {
 154                  if ($verbose) print("Indexer: locked".DOKU_LF);
 155                  return false;
 156              }
 157              @unlink($idxtag);
 158          }
 159          if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
 160          return $result;
 161      }
 162  
 163      $Indexer = idx_get_indexer();
 164      $pid = $Indexer->getPID($page);
 165      if ($pid === false) {
 166          if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
 167          return false;
 168      }
 169      $body = '';
 170      $metadata = array();
 171      $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
 172      if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
 173          $metadata['relation_references'] = array_keys($references);
 174      else
 175          $metadata['relation_references'] = array();
 176  
 177      if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
 178          $metadata['relation_media'] = array_keys($media);
 179      else
 180          $metadata['relation_media'] = array();
 181  
 182      $data = compact('page', 'body', 'metadata', 'pid');
 183      $evt = new Event('INDEXER_PAGE_ADD', $data);
 184      if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
 185      $evt->advise_after();
 186      unset($evt);
 187      extract($data);
 188  
 189      $result = $Indexer->addPageWords($page, $body);
 190      if ($result === "locked") {
 191          if ($verbose) print("Indexer: locked".DOKU_LF);
 192          return false;
 193      }
 194  
 195      if ($result) {
 196          $result = $Indexer->addMetaKeys($page, $metadata);
 197          if ($result === "locked") {
 198              if ($verbose) print("Indexer: locked".DOKU_LF);
 199              return false;
 200          }
 201      }
 202  
 203      if ($result)
 204          io_saveFile(metaFN($page,'.indexed'), idx_get_version());
 205      if ($verbose) {
 206          print("Indexer: finished".DOKU_LF);
 207          return true;
 208      }
 209      return $result;
 210  }
 211  
 212  /**
 213   * Find tokens in the fulltext index
 214   *
 215   * Takes an array of words and will return a list of matching
 216   * pages for each one.
 217   *
 218   * Important: No ACL checking is done here! All results are
 219   *            returned, regardless of permissions
 220   *
 221   * @param array      $words  list of words to search for
 222   * @return array             list of pages found, associated with the search terms
 223   */
 224  function idx_lookup(&$words) {
 225      $Indexer = idx_get_indexer();
 226      return $Indexer->lookup($words);
 227  }
 228  
 229  /**
 230   * Split a string into tokens
 231   *
 232   * @param string $string
 233   * @param bool $wc
 234   *
 235   * @return array
 236   */
 237  function idx_tokenizer($string, $wc=false) {
 238      $Indexer = idx_get_indexer();
 239      return $Indexer->tokenizer($string, $wc);
 240  }
 241  
 242  /* For compatibility */
 243  
 244  /**
 245   * Read the list of words in an index (if it exists).
 246   *
 247   * @author Tom N Harris <tnharris@whoopdedo.org>
 248   *
 249   * @param string $idx
 250   * @param string $suffix
 251   * @return array
 252   */
 253  function idx_getIndex($idx, $suffix) {
 254      global $conf;
 255      $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
 256      if (!file_exists($fn)) return array();
 257      return file($fn);
 258  }
 259  
 260  /**
 261   * Get the list of lengths indexed in the wiki.
 262   *
 263   * Read the index directory or a cache file and returns
 264   * a sorted array of lengths of the words used in the wiki.
 265   *
 266   * @author YoBoY <yoboy.leguesh@gmail.com>
 267   *
 268   * @return array
 269   */
 270  function idx_listIndexLengths() {
 271      global $conf;
 272      // testing what we have to do, create a cache file or not.
 273      if ($conf['readdircache'] == 0) {
 274          $docache = false;
 275      } else {
 276          clearstatcache();
 277          if (file_exists($conf['indexdir'].'/lengths.idx')
 278          && (time() < @filemtime($conf['indexdir'].'/lengths.idx') + $conf['readdircache'])) {
 279              if (
 280                  ($lengths = @file($conf['indexdir'].'/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES))
 281                  !== false
 282              ) {
 283                  $idx = array();
 284                  foreach ($lengths as $length) {
 285                      $idx[] = (int)$length;
 286                  }
 287                  return $idx;
 288              }
 289          }
 290          $docache = true;
 291      }
 292  
 293      if ($conf['readdircache'] == 0 || $docache) {
 294          $dir = @opendir($conf['indexdir']);
 295          if ($dir === false)
 296              return array();
 297          $idx = array();
 298          while (($f = readdir($dir)) !== false) {
 299              if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
 300                  $i = substr($f, 1, -4);
 301                  if (is_numeric($i))
 302                      $idx[] = (int)$i;
 303              }
 304          }
 305          closedir($dir);
 306          sort($idx);
 307          // save this in a file
 308          if ($docache) {
 309              $handle = @fopen($conf['indexdir'].'/lengths.idx', 'w');
 310              @fwrite($handle, implode("\n", $idx));
 311              @fclose($handle);
 312          }
 313          return $idx;
 314      }
 315  
 316      return array();
 317  }
 318  
 319  /**
 320   * Get the word lengths that have been indexed.
 321   *
 322   * Reads the index directory and returns an array of lengths
 323   * that there are indices for.
 324   *
 325   * @author YoBoY <yoboy.leguesh@gmail.com>
 326   *
 327   * @param array|int $filter
 328   * @return array
 329   */
 330  function idx_indexLengths($filter) {
 331      global $conf;
 332      $idx = array();
 333      if (is_array($filter)) {
 334          // testing if index files exist only
 335          $path = $conf['indexdir']."/i";
 336          foreach ($filter as $key => $value) {
 337              if (file_exists($path.$key.'.idx'))
 338                  $idx[] = $key;
 339          }
 340      } else {
 341          $lengths = idx_listIndexLengths();
 342          foreach ($lengths as $key => $length) {
 343              // keep all the values equal or superior
 344              if ((int)$length >= (int)$filter)
 345                  $idx[] = $length;
 346          }
 347      }
 348      return $idx;
 349  }
 350  
 351  /**
 352   * Clean a name of a key for use as a file name.
 353   *
 354   * Romanizes non-latin characters, then strips away anything that's
 355   * not a letter, number, or underscore.
 356   *
 357   * @author Tom N Harris <tnharris@whoopdedo.org>
 358   *
 359   * @param string $name
 360   * @return string
 361   */
 362  function idx_cleanName($name) {
 363      $name = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));
 364      $name = preg_replace('#[ \./\\:-]+#', '_', $name);
 365      $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
 366      return strtolower($name);
 367  }
 368  
 369  //Setup VIM: ex: et ts=4 :