You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

383 lines
10 KiB

  1. <?php
  2. /**
  3. * Functions to create the fulltext search index
  4. *
  5. * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
  6. * @author Andreas Gohr <andi@splitbrain.org>
  7. * @author Tom N Harris <tnharris@whoopdedo.org>
  8. */
  9. use dokuwiki\Utf8\Clean;
  10. use dokuwiki\Extension\Event;
  11. use dokuwiki\Search\Indexer;
  // Version tag of the index format; changing it forces a rebuild on upgrade
  define('INDEXER_VERSION', 8);

  // Minimum token length used in the index (this limit is not applied to numeric tokens).
  // The value can be overridden by defining the constant before this file is loaded.
  if (!defined('IDX_MINWORDLENGTH')) {
      define('IDX_MINWORDLENGTH', 2);
  }
  /**
   * Build the version identifier of the indexer, including plugin contributions.
   *
   * Index data is only compatible with an indexer reporting the same version.
   * The identifier is computed on the first call and served from a static
   * cache afterwards.
   *
   * @triggers INDEXER_VERSION_GET
   *           Plugins that modify what gets indexed should handle this event
   *           and add their own version to the event data, like so:
   *           $data[$plugin_name] = $plugin_version;
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   * @author Michael Hamann <michael@content-space.de>
   *
   * @return int|string the bare INDEXER_VERSION, or that version followed by
   *                    "+plugin=version" for every registered plugin
   */
  function idx_get_version()
  {
      static $cached = null;

      if ($cached != null) {
          return $cached;
      }

      $identifier = INDEXER_VERSION;

      // The core version is handed to the event for the convenience of plugins
      $data = ['dokuwiki' => $identifier];
      Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

      // The core version already leads the identifier, so only the plugin
      // entries are appended - sorted by name to keep the result stable
      unset($data['dokuwiki']);
      ksort($data);
      foreach ($data as $plugin => $pluginVersion) {
          $identifier .= '+' . $plugin . '=' . $pluginVersion;
      }

      $cached = $identifier;
      return $cached;
  }
  /**
   * Compute the "length" of a word as used by the index.
   *
   * The result is the byte length of the string plus a bonus for every byte in
   * the range 0xE2-0xEF (lead bytes of three-byte UTF-8 sequences): each such
   * byte adds its value minus 0xE1. This is how the result differs from
   * strlen() for asian characters.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   *
   * @param string $w the word to measure
   * @return int the (faked) length
   */
  function wordlen($w)
  {
      $length = strlen($w);

      // If left alone, all chinese "words" would be put into w3.idx,
      // so the "length" of such a "word" is faked
      if (preg_match_all('/[\xE2-\xEF]/', $w, $matches)) {
          $length += array_sum(
              array_map(static fn($byte) => ord($byte) - 0xE1, $matches[0])
          );
      }

      return $length;
  }
  /**
   * Get the indexer instance.
   *
   * The Indexer is created on the first call and the same object is handed
   * out by every later call.
   *
   * @return Indexer the shared Indexer instance
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  function idx_get_indexer()
  {
      static $instance = null;

      $instance ??= new Indexer();

      return $instance;
  }
  /**
   * Returns words that will be ignored.
   *
   * The list is read from inc/lang/<lang>/stopwords.txt of the configured
   * language (one word per line) on the first call and cached in a static
   * variable afterwards. The array is returned by reference.
   *
   * @return array list of stop words; empty when the language has no
   *               stopwords file or the file could not be read
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  function & idx_get_stopwords()
  {
      static $stopwords = null;

      if ($stopwords === null) {
          global $conf;
          $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';

          $lines = file_exists($swfile) ? file($swfile, FILE_IGNORE_NEW_LINES) : false;

          // file() yields false when the file exists but cannot be read; never
          // cache or return that, callers are promised an array
          $stopwords = ($lines === false) ? [] : $lines;
      }

      return $stopwords;
  }
  /**
   * Adds/updates the search index for the given page
   *
   * Locking is handled internally.
   *
   * The function removes deleted pages and pages with indexing disabled from
   * the index, skips pages whose index is up to date (unless $force is set)
   * and otherwise indexes the page text and selected metadata (title,
   * referenced pages, used media).
   *
   * @triggers INDEXER_PAGE_ADD
   *           Event data has the keys 'page', 'body', 'metadata' and 'pid'.
   *           Unless the default action is prevented, the raw wiki text of the
   *           page is appended to 'body'.
   *
   * @param string $page name of the page to index
   * @param boolean $verbose print status messages
   * @param boolean $force force reindexing even when the index is up to date
   * @return string|boolean result of the index operation; false when nothing
   *                        was done, the PID lookup failed or the index was
   *                        locked. Note that in verbose mode true is returned
   *                        once indexing was attempted, whatever the result.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  function idx_addPage($page, $verbose = false, $force = false)
  {
      $idxtag = metaFN($page, '.indexed');

      // check if page was deleted but is still in the index
      if (!page_exists($page)) {
          if (!file_exists($idxtag)) {
              if ($verbose) {
                  echo "Indexer: $page does not exist, ignoring" . DOKU_LF;
              }
              return false;
          }
          $Indexer = idx_get_indexer();
          $result = $Indexer->deletePage($page);
          if ($result === "locked") {
              if ($verbose) {
                  echo "Indexer: locked" . DOKU_LF;
              }
              return false;
          }
          @unlink($idxtag);
          return $result;
      }

      // check if indexing is needed: the tag file must carry the current
      // indexer version and be newer than the page itself
      if (!$force && file_exists($idxtag)) {
          if (trim(io_readFile($idxtag)) == idx_get_version()) {
              $last = @filemtime($idxtag);
              if ($last > @filemtime(wikiFN($page))) {
                  if ($verbose) {
                      echo "Indexer: index for $page up to date" . DOKU_LF;
                  }
                  return false;
              }
          }
      }

      // indexing disabled through the page metadata: drop a stale index entry
      $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
      if ($indexenabled === false) {
          $result = false;
          if (file_exists($idxtag)) {
              $Indexer = idx_get_indexer();
              $result = $Indexer->deletePage($page);
              if ($result === "locked") {
                  if ($verbose) {
                      echo "Indexer: locked" . DOKU_LF;
                  }
                  return false;
              }
              @unlink($idxtag);
          }
          if ($verbose) {
              echo "Indexer: index disabled for $page" . DOKU_LF;
          }
          return $result;
      }

      $Indexer = idx_get_indexer();
      $pid = $Indexer->getPID($page);
      if ($pid === false) {
          if ($verbose) {
              echo "Indexer: getting the PID failed for $page" . DOKU_LF;
          }
          return false;
      }

      // collect the data to index
      $body = '';
      $metadata = [];
      $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

      $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
      $metadata['relation_references'] = ($references !== null) ? array_keys($references) : [];

      $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
      $metadata['relation_media'] = ($media !== null) ? array_keys($media) : [];

      $data = ['page' => $page, 'body' => $body, 'metadata' => $metadata, 'pid' => $pid];
      $evt = new Event('INDEXER_PAGE_ADD', $data);
      if ($evt->advise_before()) {
          $data['body'] = $data['body'] . " " . rawWiki($page);
      }
      $evt->advise_after();
      unset($evt);

      // Take over what the event handlers may have changed. Only the known keys
      // are read (instead of extract()ing the whole array) so that a handler
      // cannot overwrite unrelated locals such as $Indexer or $verbose.
      $page = $data['page'] ?? $page;
      $body = $data['body'] ?? $body;
      $metadata = $data['metadata'] ?? $metadata;

      $result = $Indexer->addPageWords($page, $body);
      if ($result === "locked") {
          if ($verbose) {
              echo "Indexer: locked" . DOKU_LF;
          }
          return false;
      }

      if ($result) {
          $result = $Indexer->addMetaKeys($page, $metadata);
          if ($result === "locked") {
              if ($verbose) {
                  echo "Indexer: locked" . DOKU_LF;
              }
              return false;
          }
      }

      // remember which indexer version wrote the index for this page
      if ($result) {
          io_saveFile(metaFN($page, '.indexed'), idx_get_version());
      }

      if ($verbose) {
          echo "Indexer: finished" . DOKU_LF;
          return true;
      }
      return $result;
  }
  /**
   * Look up tokens in the fulltext index
   *
   * Hands the given words to the indexer's lookup() and returns its result:
   * a list of matching pages for each word.
   *
   * Important: No ACL checking is done here! All results are
   * returned, regardless of permissions
   *
   * @param array $words list of words to search for (passed by reference)
   * @return array list of pages found, associated with the search terms
   */
  function idx_lookup(&$words)
  {
      return idx_get_indexer()->lookup($words);
  }
  /**
   * Split a string into tokens
   *
   * Thin wrapper that forwards both arguments to the indexer's tokenizer().
   *
   * @param string $string the text to split
   * @param bool $wc passed through to the indexer unchanged
   *
   * @return array
   */
  function idx_tokenizer($string, $wc = false)
  {
      return idx_get_indexer()->tokenizer($string, $wc);
  }
  228. /* For compatibility */
  /**
   * Read the list of words in an index (if it exists).
   *
   * The index is the file <indexdir>/<idx><suffix>.idx. Its lines are returned
   * as they are, including their line endings.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   *
   * @param string $idx name of the index
   * @param string $suffix appended to the index name
   * @return array the lines of the index file; empty when the file does not
   *               exist or could not be read
   */
  function idx_getIndex($idx, $suffix)
  {
      global $conf;

      $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
      if (!file_exists($fn)) {
          return [];
      }

      // file() returns false on a read failure; callers are promised an array
      $lines = file($fn);
      return ($lines === false) ? [] : $lines;
  }
  /**
   * Get the list of lengths indexed in the wiki.
   *
   * Read the index directory or a cache file and returns
   * a sorted array of lengths of the words used in the wiki.
   *
   * With $conf['readdircache'] set to a non-zero number of seconds, the list
   * is cached in <indexdir>/lengths.idx and read from there while that file is
   * younger than the configured time. A cache file that cannot be written is
   * ignored; the freshly read list is returned anyway.
   *
   * @author YoBoY <yoboy.leguesh@gmail.com>
   *
   * @return array sorted list of integer lengths; empty when the index
   *               directory cannot be opened
   */
  function idx_listIndexLengths()
  {
      global $conf;

      $cachefile = $conf['indexdir'] . '/lengths.idx';
      $docache = ($conf['readdircache'] != 0);

      // try the cache file first, as long as it is fresh enough
      if ($docache) {
          clearstatcache();
          if (
              file_exists($cachefile)
              && (time() < @filemtime($cachefile) + $conf['readdircache'])
          ) {
              $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
              if ($lengths !== false) {
                  return array_map(static fn($length) => (int)$length, $lengths);
              }
          }
      }

      // no usable cache: collect the lengths from the i<length>.idx file names
      $dir = @opendir($conf['indexdir']);
      if ($dir === false) {
          return [];
      }
      $idx = [];
      while (($f = readdir($dir)) !== false) {
          if (str_starts_with($f, 'i') && str_ends_with($f, '.idx')) {
              $i = substr($f, 1, -4);
              if (is_numeric($i)) {
                  $idx[] = (int)$i;
              }
          }
      }
      closedir($dir);
      sort($idx);

      // save this in a file
      if ($docache) {
          $handle = @fopen($cachefile, 'w');
          // fopen() returns false when the file cannot be opened for writing.
          // Handing that to fwrite()/fclose() raises a TypeError on PHP 8,
          // which the @ operator does not suppress.
          if ($handle !== false) {
              @fwrite($handle, implode("\n", $idx));
              @fclose($handle);
          }
      }

      return $idx;
  }
  /**
   * Get the word lengths that have been indexed.
   *
   * Two modes, depending on the type of $filter:
   * - array: its keys are taken as lengths, and those for which an index file
   *   <indexdir>/i<length>.idx exists are returned
   * - anything else: taken as a minimum, all indexed lengths equal to or
   *   greater than it are returned
   *
   * @author YoBoY <yoboy.leguesh@gmail.com>
   *
   * @param array|int $filter lengths to test (as array keys) or a minimum length
   * @return array list of lengths
   */
  function idx_indexLengths($filter)
  {
      global $conf;

      if (!is_array($filter)) {
          // keep all the values equal or superior
          $minimum = (int)$filter;
          return array_values(array_filter(
              idx_listIndexLengths(),
              static fn($length) => (int)$length >= $minimum
          ));
      }

      // testing if index files exist only
      $prefix = $conf['indexdir'] . "/i";
      $found = [];
      foreach ($filter as $length => $ignored) {
          if (file_exists($prefix . $length . '.idx')) {
              $found[] = $length;
          }
      }
      return $found;
  }
  /**
   * Clean a name of a key for use as a file name.
   *
   * The name is cast to string, trimmed and romanized. Runs of separators
   * (space, dot, slash, colon, dash) then become a single underscore, every
   * remaining character that is not an ASCII letter, digit or underscore is
   * dropped, and the result is lower-cased.
   *
   * NOTE(review): within the single-quoted pattern, `\\:` reaches PCRE as `\:`,
   * so a literal backslash is not part of the separator class and gets removed
   * by the second replacement instead - confirm whether that is intended.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   *
   * @param string $name the key name to clean
   * @return string the cleaned, lower-case name
   */
  function idx_cleanName($name)
  {
      $romanized = Clean::romanize(trim((string)$name));

      // collapse separator runs first, then strip everything that is left over
      $underscored = preg_replace('#[ \./\\:-]+#', '_', $romanized);
      $stripped = preg_replace('/[^A-Za-z0-9_]/', '', $underscored);

      return strtolower($stripped);
  }
  354. //Setup VIM: ex: et ts=4 :