[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/ -> indexer.php (source)

   1  <?php
   2  
   3  /**
   4   * Functions to create the fulltext search index
   5   *
   6   * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
   7   * @author     Andreas Gohr <andi@splitbrain.org>
   8   * @author     Tom N Harris <tnharris@whoopdedo.org>
   9   */
  10  
  11  use dokuwiki\Utf8\Clean;
  12  use dokuwiki\Extension\Event;
  13  use dokuwiki\Search\Indexer;
  14  
  15  // Version tag used to force rebuild on upgrade
  16  define('INDEXER_VERSION', 8);
  17  
  18  // set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
  19  if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH', 2);
  20  
  21  /**
  22   * Version of the indexer taking into consideration the external tokenizer.
  23   * The indexer is only compatible with data written by the same version.
  24   *
  25   * @triggers INDEXER_VERSION_GET
  26   * Plugins that modify what gets indexed should hook this event and
  27   * add their version info to the event data like so:
  28   *     $data[$plugin_name] = $plugin_version;
  29   *
  30   * @author Tom N Harris <tnharris@whoopdedo.org>
  31   * @author Michael Hamann <michael@content-space.de>
  32   *
  33   * @return int|string
  34   */
  35  function idx_get_version()
  36  {
  37      static $indexer_version = null;
  38      if ($indexer_version == null) {
  39          $version = INDEXER_VERSION;
  40  
  41          // DokuWiki version is included for the convenience of plugins
  42          $data = ['dokuwiki' => $version];
  43          Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
  44          unset($data['dokuwiki']); // this needs to be first
  45          ksort($data);
  46          foreach ($data as $plugin => $vers)
  47              $version .= '+' . $plugin . '=' . $vers;
  48          $indexer_version = $version;
  49      }
  50      return $indexer_version;
  51  }
  52  
  53  /**
  54   * Measure the length of a string.
  55   * Differs from strlen in handling of asian characters.
  56   *
  57   * @author Tom N Harris <tnharris@whoopdedo.org>
  58   *
  59   * @param string $w
  60   * @return int
  61   */
  62  function wordlen($w)
  63  {
  64      $l = strlen($w);
  65      // If left alone, all chinese "words" will get put into w3.idx
  66      // So the "length" of a "word" is faked
  67      if (preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes)) {
  68          foreach ($leadbytes[0] as $b)
  69              $l += ord($b) - 0xE1;
  70      }
  71      return $l;
  72  }
  73  
  74  /**
  75   * Create an instance of the indexer.
  76   *
  77   * @return Indexer    an Indexer
  78   *
  79   * @author Tom N Harris <tnharris@whoopdedo.org>
  80   */
  81  function idx_get_indexer()
  82  {
  83      static $Indexer;
  84      if (!isset($Indexer)) {
  85          $Indexer = new Indexer();
  86      }
  87      return $Indexer;
  88  }
  89  
  90  /**
  91   * Returns words that will be ignored.
  92   *
  93   * @return array                list of stop words
  94   *
  95   * @author Tom N Harris <tnharris@whoopdedo.org>
  96   */
  97  function & idx_get_stopwords()
  98  {
  99      static $stopwords = null;
 100      if (is_null($stopwords)) {
 101          global $conf;
 102          $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
 103          if (file_exists($swfile)) {
 104              $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
 105          } else {
 106              $stopwords = [];
 107          }
 108      }
 109      return $stopwords;
 110  }
 111  
 112  /**
 113   * Adds/updates the search index for the given page
 114   *
 115   * Locking is handled internally.
 116   *
 117   * @param string        $page   name of the page to index
 118   * @param boolean       $verbose    print status messages
 119   * @param boolean       $force  force reindexing even when the index is up to date
 120   * @return string|boolean  the function completed successfully
 121   *
 122   * @author Tom N Harris <tnharris@whoopdedo.org>
 123   */
 124  function idx_addPage($page, $verbose = false, $force = false)
 125  {
 126      $idxtag = metaFN($page, '.indexed');
 127      // check if page was deleted but is still in the index
 128      if (!page_exists($page)) {
 129          if (!file_exists($idxtag)) {
 130              if ($verbose) echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
 131              return false;
 132          }
 133          $Indexer = idx_get_indexer();
 134          $result = $Indexer->deletePage($page);
 135          if ($result === "locked") {
 136              if ($verbose) echo "Indexer: locked" . DOKU_LF;
 137              return false;
 138          }
 139          @unlink($idxtag);
 140          return $result;
 141      }
 142  
 143      // check if indexing needed
 144      if (!$force && file_exists($idxtag)) {
 145          if (trim(io_readFile($idxtag)) == idx_get_version()) {
 146              $last = @filemtime($idxtag);
 147              if ($last > @filemtime(wikiFN($page))) {
 148                  if ($verbose) echo "Indexer: index for $page up to date" . DOKU_LF;
 149                  return false;
 150              }
 151          }
 152      }
 153  
 154      $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
 155      if ($indexenabled === false) {
 156          $result = false;
 157          if (file_exists($idxtag)) {
 158              $Indexer = idx_get_indexer();
 159              $result = $Indexer->deletePage($page);
 160              if ($result === "locked") {
 161                  if ($verbose) echo "Indexer: locked" . DOKU_LF;
 162                  return false;
 163              }
 164              @unlink($idxtag);
 165          }
 166          if ($verbose) echo "Indexer: index disabled for $page" . DOKU_LF;
 167          return $result;
 168      }
 169  
 170      $Indexer = idx_get_indexer();
 171      $pid = $Indexer->getPID($page);
 172      if ($pid === false) {
 173          if ($verbose) echo "Indexer: getting the PID failed for $page" . DOKU_LF;
 174          return false;
 175      }
 176      $body = '';
 177      $metadata = [];
 178      $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
 179      if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
 180          $metadata['relation_references'] = array_keys($references);
 181      else $metadata['relation_references'] = [];
 182  
 183      if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
 184          $metadata['relation_media'] = array_keys($media);
 185      else $metadata['relation_media'] = [];
 186  
 187      $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
 188      $evt = new Event('INDEXER_PAGE_ADD', $data);
 189      if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
 190      $evt->advise_after();
 191      unset($evt);
 192      extract($data);
 193  
 194      $result = $Indexer->addPageWords($page, $body);
 195      if ($result === "locked") {
 196          if ($verbose) echo "Indexer: locked" . DOKU_LF;
 197          return false;
 198      }
 199  
 200      if ($result) {
 201          $result = $Indexer->addMetaKeys($page, $metadata);
 202          if ($result === "locked") {
 203              if ($verbose) echo "Indexer: locked" . DOKU_LF;
 204              return false;
 205          }
 206      }
 207  
 208      if ($result)
 209          io_saveFile(metaFN($page, '.indexed'), idx_get_version());
 210      if ($verbose) {
 211          echo "Indexer: finished" . DOKU_LF;
 212          return true;
 213      }
 214      return $result;
 215  }
 216  
 217  /**
 218   * Find tokens in the fulltext index
 219   *
 220   * Takes an array of words and will return a list of matching
 221   * pages for each one.
 222   *
 223   * Important: No ACL checking is done here! All results are
 224   *            returned, regardless of permissions
 225   *
 226   * @param array      $words  list of words to search for
 227   * @return array             list of pages found, associated with the search terms
 228   */
 229  function idx_lookup(&$words)
 230  {
 231      $Indexer = idx_get_indexer();
 232      return $Indexer->lookup($words);
 233  }
 234  
 235  /**
 236   * Split a string into tokens
 237   *
 238   * @param string $string
 239   * @param bool $wc
 240   *
 241   * @return array
 242   */
 243  function idx_tokenizer($string, $wc = false)
 244  {
 245      $Indexer = idx_get_indexer();
 246      return $Indexer->tokenizer($string, $wc);
 247  }
 248  
 249  /* For compatibility */
 250  
 251  /**
 252   * Read the list of words in an index (if it exists).
 253   *
 254   * @author Tom N Harris <tnharris@whoopdedo.org>
 255   *
 256   * @param string $idx
 257   * @param string $suffix
 258   * @return array
 259   */
 260  function idx_getIndex($idx, $suffix)
 261  {
 262      global $conf;
 263      $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
 264      if (!file_exists($fn)) return [];
 265      return file($fn);
 266  }
 267  
 268  /**
 269   * Get the list of lengths indexed in the wiki.
 270   *
 271   * Read the index directory or a cache file and returns
 272   * a sorted array of lengths of the words used in the wiki.
 273   *
 274   * @author YoBoY <yoboy.leguesh@gmail.com>
 275   *
 276   * @return array
 277   */
 278  function idx_listIndexLengths()
 279  {
 280      global $conf;
 281      // testing what we have to do, create a cache file or not.
 282      if ($conf['readdircache'] == 0) {
 283          $docache = false;
 284      } else {
 285          clearstatcache();
 286          if (
 287              file_exists($conf['indexdir'] . '/lengths.idx')
 288              && (time() < @filemtime($conf['indexdir'] . '/lengths.idx') + $conf['readdircache'])
 289          ) {
 290              if (
 291                  ($lengths = @file($conf['indexdir'] . '/lengths.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES))
 292                  !== false
 293              ) {
 294                  $idx = [];
 295                  foreach ($lengths as $length) {
 296                      $idx[] = (int)$length;
 297                  }
 298                  return $idx;
 299              }
 300          }
 301          $docache = true;
 302      }
 303  
 304      if ($conf['readdircache'] == 0 || $docache) {
 305          $dir = @opendir($conf['indexdir']);
 306          if ($dir === false)
 307              return [];
 308          $idx = [];
 309          while (($f = readdir($dir)) !== false) {
 310              if (str_starts_with($f, 'i') && str_ends_with($f, '.idx')) {
 311                  $i = substr($f, 1, -4);
 312                  if (is_numeric($i))
 313                      $idx[] = (int)$i;
 314              }
 315          }
 316          closedir($dir);
 317          sort($idx);
 318          // save this in a file
 319          if ($docache) {
 320              $handle = @fopen($conf['indexdir'] . '/lengths.idx', 'w');
 321              @fwrite($handle, implode("\n", $idx));
 322              @fclose($handle);
 323          }
 324          return $idx;
 325      }
 326  
 327      return [];
 328  }
 329  
 330  /**
 331   * Get the word lengths that have been indexed.
 332   *
 333   * Reads the index directory and returns an array of lengths
 334   * that there are indices for.
 335   *
 336   * @author YoBoY <yoboy.leguesh@gmail.com>
 337   *
 338   * @param array|int $filter
 339   * @return array
 340   */
 341  function idx_indexLengths($filter)
 342  {
 343      global $conf;
 344      $idx = [];
 345      if (is_array($filter)) {
 346          // testing if index files exist only
 347          $path = $conf['indexdir'] . "/i";
 348          foreach (array_keys($filter) as $key) {
 349              if (file_exists($path . $key . '.idx'))
 350                  $idx[] = $key;
 351          }
 352      } else {
 353          $lengths = idx_listIndexLengths();
 354          foreach ($lengths as $length) {
 355              // keep all the values equal or superior
 356              if ((int)$length >= (int)$filter)
 357                  $idx[] = $length;
 358          }
 359      }
 360      return $idx;
 361  }
 362  
 363  /**
 364   * Clean a name of a key for use as a file name.
 365   *
 366   * Romanizes non-latin characters, then strips away anything that's
 367   * not a letter, number, or underscore.
 368   *
 369   * @author Tom N Harris <tnharris@whoopdedo.org>
 370   *
 371   * @param string $name
 372   * @return string
 373   */
 374  function idx_cleanName($name)
 375  {
 376      $name = Clean::romanize(trim((string)$name));
 377      $name = preg_replace('#[ \./\\:-]+#', '_', $name);
 378      $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
 379      return strtolower($name);
 380  }
 381  
 382  //Setup VIM: ex: et ts=4 :