[ Index ] |
PHP Cross Reference of DokuWiki |
[Summary view] [Print] [Text view]
<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 * @author     Tom N Harris <tnharris@whoopdedo.org>
 */

use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 8);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * The result is computed once per request and memoized in a static variable.
 * Without plugin additions the plain INDEXER_VERSION constant is returned;
 * otherwise a string of the form "8+plugin=version+..." is built.
 *
 * @triggers INDEXER_VERSION_GET
 * Plugins that modify what gets indexed should hook this event and
 * add their version info to the event data like so:
 *     $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string
 */
function idx_get_version(){
    static $indexer_version = null;
    // strict check: only "not yet computed" should trigger the computation
    if ($indexer_version === null) {
        $version = INDEXER_VERSION;

        // DokuWiki version is included for the convenience of plugins
        $data = array('dokuwiki'=>$version);
        Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);
        // the core version is already the prefix of $version, so drop it
        // before sorting: only the plugin entries get sorted and appended
        unset($data['dokuwiki']); // this needs to be first
        ksort($data);
        foreach ($data as $plugin=>$vers)
            $version .= '+'.$plugin.'='.$vers;
        $indexer_version = $version;
    }
    return $indexer_version;
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
 *
 * Starts from the byte length and adds, for every byte in the range
 * 0xE2-0xEF, the value (byte - 0xE1).
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w
 * @return int
 */
function wordlen($w){
    $l = strlen($w);
    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if(preg_match_all('/[\xE2-\xEF]/',$w,$leadbytes)) {
        foreach($leadbytes[0] as $b)
            $l += ord($b) - 0xE1;
    }
    return $l;
}

/**
 * Create an instance of the indexer.
 *
 * The instance is created on first use and shared by all later calls.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer() {
    static $Indexer;
    if (!isset($Indexer)) {
        $Indexer = new Indexer();
    }
    return $Indexer;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from inc/lang/<lang>/stopwords.txt of the configured
 * language and memoized; a missing file yields an empty list.
 * The array is returned by reference.
 *
 * @return array list of stop words
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $stopwords = null;
    if (is_null($stopwords)) {
        global $conf;
        $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
        if(file_exists($swfile)){
            $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
        }else{
            $stopwords = array();
        }
    }
    return $stopwords;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * A '.indexed' meta file holding the indexer version marks a page as
 * indexed; its modification time is compared against the page file to
 * decide whether reindexing is needed.
 *
 * NOTE: when $verbose is set and processing reaches the end of the function,
 * true is returned regardless of the indexing result. This behaviour is kept
 * as is because callers may rely on it.
 *
 * @triggers INDEXER_PAGE_ADD
 *
 * @param string        $page   name of the page to index
 * @param boolean       $verbose    print status messages
 * @param boolean       $force  force reindexing even when the index is up to date
 * @return string|boolean  the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false) {
    $idxtag = metaFN($page,'.indexed');
    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed
    if(!$force && file_exists($idxtag)){
        // loose comparison on purpose: the tag file holds a string while
        // idx_get_version() may return the integer INDEXER_VERSION
        if(trim(io_readFile($idxtag)) == idx_get_version()){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($page))){
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    // pages whose 'internal index' metadata is exactly false are removed
    // from the index (if present) and not indexed
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }
    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);
    if (($references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED)) !== null)
        $metadata['relation_references'] = array_keys($references);
    else
        $metadata['relation_references'] = array();

    if (($media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED)) !== null)
        $metadata['relation_media'] = array_keys($media);
    else
        $metadata['relation_media'] = array();

    // let plugins modify what gets indexed; unless the default action is
    // prevented, the raw wiki text is appended to the body
    $data = compact('page', 'body', 'metadata', 'pid');
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);
    // import the (possibly modified) event data back into the local variables
    extract($data);

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // only mark the page as indexed when words and metadata were both stored
    if ($result)
        io_saveFile(metaFN($page,'.indexed'), idx_get_version());
    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        return true;
    }
    return $result;
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here! All results are
 *            returned, regardless of permissions
 *
 * @param array      $words  list of words to search for
 * @return array             list of pages found, associated with the search terms
 */
function idx_lookup(&$words) {
    $Indexer = idx_get_indexer();
    return $Indexer->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Thin wrapper around Indexer::tokenizer() of the shared indexer instance.
 *
 * @param string $string
 * @param bool $wc
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false) {
    $Indexer = idx_get_indexer();
    return $Indexer->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * The lines are returned as read by file(), i.e. including their line
 * endings. A missing or unreadable index yields an empty array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx
 * @param string $suffix
 * @return array
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();
    $lines = file($fn);
    // file() returns false on failure; honour the documented array return type
    if ($lines === false) return array();
    return $lines;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * When $conf['readdircache'] is non-zero, the result is cached in
 * lengths.idx inside the index directory and reused for that many seconds.
 * Failure to write the cache file is ignored.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';
    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // no usable cache: collect the numeric part of all i<N>.idx file names
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);
    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // fopen() returns false on failure; passing that to fwrite()/fclose()
        // throws a TypeError on PHP 8 which the @ operator does not suppress
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * If $filter is an array, its keys are taken as the lengths to test and
 * those with an existing i<length>.idx file are returned. Otherwise all
 * indexed lengths greater than or equal to $filter are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = array();
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach (array_keys($filter) as $key) {
            if (file_exists($path.$key.'.idx'))
                $idx[] = $key;
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter)
                $idx[] = $length;
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
 *
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * NOTE(review): in the single-quoted pattern below '\\:' reaches PCRE as
 * '\:' (an escaped colon), so literal backslashes are not turned into
 * underscores but removed by the second replacement. This is left unchanged
 * because the result names index files that may already exist.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name) {
    $name = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));
    // runs of separators (space . / : -) collapse into a single underscore
    $name = preg_replace('#[ \./\\:-]+#', '_', $name);
    $name = preg_replace('/[^A-Za-z0-9_]/', '', $name);
    return strtolower($name);
}

//Setup VIM: ex: et ts=4 :
title
Description
Body
title
Description
Body
title
Description
Body
title
Body