* @author Tom N Harris
 */

use dokuwiki\Utf8\Clean;
use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}

/**
 * Build the version tag of the indexer.
 *
 * The tag starts with INDEXER_VERSION and is followed by one
 * "+name=version" part per entry that event handlers added to the
 * event data, sorted by name. The result is computed once per request
 * and then served from a static cache.
 *
 * @triggers INDEXER_VERSION_GET
 *           Plugins that modify what gets indexed should hook this event and
 *           add their version info to the event data like so:
 *               $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris
 * @author Michael Hamann
 *
 * @return int|string
 */
function idx_get_version()
{
    static $indexer_version = null;

    // serve the cached tag when one has been built already
    if ($indexer_version != null) {
        return $indexer_version;
    }

    $tag = INDEXER_VERSION;

    // DokuWiki version is included for the convenience of plugins
    $versions = ['dokuwiki' => $tag];
    Event::createAndTrigger('INDEXER_VERSION_GET', $versions, null, false);

    // the core version already leads the tag, so drop it before sorting the rest
    unset($versions['dokuwiki']);
    ksort($versions);

    foreach ($versions as $name => $number) {
        $tag .= '+' . $name . '=' . $number;
    }

    $indexer_version = $tag;
    return $indexer_version;
}

/**
 * Compute the "length" of a word as used by the index.
 *
 * Starts from the byte length and adds an extra amount for every byte
 * in the range 0xE2-0xEF, so that words made of such multi-byte
 * characters do not all end up with the same length.
 *
 * @author Tom N Harris
 *
 * @param string $w
 * @return int
 */
function wordlen($w)
{
    $length = strlen($w);

    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    $found = preg_match_all('/[\xE2-\xEF]/', $w, $leadbytes);
    if (!$found) {
        return $length;
    }

    foreach ($leadbytes[0] as $byte) {
        $length += ord($byte) - 0xE1;
    }
    return $length;
}

/**
 * Return the shared Indexer object, creating it on first use.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris
 */
function idx_get_indexer()
{
    static $instance;
    $instance ??= new Indexer();
    return $instance;
}

/**
 * Returns words that will be ignored.
*
 * @return array list of stop words
 *
 * @author Tom N Harris
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
        $lines = file_exists($swfile) ? file($swfile, FILE_IGNORE_NEW_LINES) : false;
        // file() returns false when the file cannot be read; never cache that,
        // callers expect an array
        $stopwords = ($lines === false) ? [] : $lines;
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Pages that no longer exist, or whose 'internal index' metadata is false,
 * are removed from the index when a '.indexed' tag file is still present.
 * Otherwise the page is (re)indexed unless the tag file carries the current
 * indexer version and is newer than the page file.
 *
 * @triggers INDEXER_PAGE_ADD
 *           The event data holds the keys page, body, metadata and pid. The
 *           values of page, body and metadata found in the event data after
 *           the event has run are the ones that get indexed.
 *
 * @param string $page name of the page to index
 * @param boolean $verbose print status messages
 * @param boolean $force force reindexing even when the index is up to date
 * @return string|boolean false when nothing was done, the index was locked or
 *                        indexing failed; otherwise the indexer's result
 *                        (always true in verbose mode once indexing was attempted)
 *
 * @author Tom N Harris
 */
function idx_addPage($page, $verbose = false, $force = false)
{
    $idxtag = metaFN($page, '.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if (!$force && file_exists($idxtag)) {
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) echo "Indexer: index for $page up to date" . DOKU_LF;
                return false;
            }
        }
    }

    // the page opted out of indexing: drop whatever is still in the index
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) echo "Indexer: locked" . DOKU_LF;
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) echo "Indexer: index disabled for $page" . DOKU_LF;
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) echo "Indexer: getting the PID failed for $page" . DOKU_LF;
        return false;
    }

    $body = '';
    $metadata = [];
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only the keys of the relation arrays are indexed; anything that is not
    // an array (including missing metadata) counts as "no relations"
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : [];

    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : [];

    $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // Take over only the documented keys from the event data. A blanket
    // extract() would let any additional key overwrite unrelated local
    // variables such as $Indexer or $verbose.
    $page = $data['page'] ?? $page;
    $body = $data['body'] ?? $body;
    $metadata = $data['metadata'] ?? $metadata;

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) echo "Indexer: locked" . DOKU_LF;
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) echo "Indexer: locked" . DOKU_LF;
            return false;
        }
    }

    // remember which indexer version produced the index for this page
    if ($result) io_saveFile(metaFN($page, '.indexed'), idx_get_version());

    if ($verbose) {
        echo "Indexer: finished" . DOKU_LF;
        // NOTE(review): verbose mode reports true even when $result is falsy;
        // kept as is because callers may rely on it
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here!
* All results are
 * returned, regardless of permissions
 *
 * @param array $words list of words to search for
 * @return array list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper around the tokenizer of the shared Indexer object.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc = false)
{
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The file is "<indexdir>/<idx><suffix>.idx". Lines are returned as read,
 * including their line endings.
 *
 * @author Tom N Harris
 *
 * @param string $idx
 * @param string $suffix
 * @return array the lines of the index, empty when the file is missing or unreadable
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!file_exists($fn)) return [];

    // file() returns false on failure; keep the documented array return type
    $lines = file($fn);
    return ($lines === false) ? [] : $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * The cache file "<indexdir>/lengths.idx" is only used when
 * $conf['readdircache'] is non-zero, and only while it is younger than
 * that many seconds. Writing the cache is best effort: a failure to write
 * does not affect the returned list.
 *
 * @author YoBoY
 *
 * @return array
 */
function idx_listIndexLengths()
{
    global $conf;

    $cacheFile = $conf['indexdir'] . '/lengths.idx';
    $useCache = ($conf['readdircache'] != 0);

    if ($useCache) {
        clearstatcache();
        if (
            file_exists($cacheFile)
            && (time() < @filemtime($cacheFile) + $conf['readdircache'])
        ) {
            $lengths = @file($cacheFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = [];
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: collect the numeric part of every i<number>.idx file
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) return [];
    $idx = [];
    while (($f = readdir($dir)) !== false) {
        if (str_starts_with($f, 'i') && str_ends_with($f, '.idx')) {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file; file_put_contents() simply returns false when the
    // file cannot be opened, whereas fwrite() on a failed fopen() handle
    // throws a TypeError on PHP 8
    if ($useCache) {
        @file_put_contents($cacheFile, implode("\n", $idx));
    }

    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * When $filter is an array, its keys are taken as lengths and only those
 * with an existing "i<length>.idx" file are returned. Otherwise all indexed
 * lengths greater than or equal to $filter are returned.
 *
 * @author YoBoY
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter)
{
    global $conf;
    $idx = [];
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir'] . "/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path . $key . '.idx')) $idx[] = $key;
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter) $idx[] = $length;
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore. Before stripping, runs of spaces,
 * dots, slashes, colons and hyphens are collapsed into one underscore.
 * The result is lower-cased.
 *
 * @author Tom N Harris
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $name = Clean::romanize(trim((string)$name));
    // the patterns decide the names of index files on disk; do not alter them
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :