You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

1248 lines
41 KiB

  1. <?php
  2. namespace dokuwiki\Search;
  3. use dokuwiki\Utf8\Asian;
  4. use dokuwiki\Utf8\Clean;
  5. use dokuwiki\Utf8\PhpString;
  6. use dokuwiki\Extension\Event;
  7. /**
  8. * Class that encapsulates operations on the indexer database.
  9. *
  10. * @author Tom N Harris <tnharris@whoopdedo.org>
  11. */
  12. class Indexer
  13. {
  14. /**
  15. * @var array $pidCache Cache for getPID()
  16. */
  17. protected $pidCache = [];
  18. /**
  19. * Adds the contents of a page to the fulltext index
  20. *
  21. * The added text replaces previous words for the same page.
  22. * An empty value erases the page.
  23. *
  24. * @param string $page a page name
  25. * @param string $text the body of the page
  26. * @return string|boolean the function completed successfully
  27. *
  28. * @author Tom N Harris <tnharris@whoopdedo.org>
  29. * @author Andreas Gohr <andi@splitbrain.org>
  30. */
  31. public function addPageWords($page, $text)
  32. {
  33. if (!$this->lock())
  34. return "locked";
  35. // load known documents
  36. $pid = $this->getPIDNoLock($page);
  37. if ($pid === false) {
  38. $this->unlock();
  39. return false;
  40. }
  41. $pagewords = [];
  42. // get word usage in page
  43. $words = $this->getPageWords($text);
  44. if ($words === false) {
  45. $this->unlock();
  46. return false;
  47. }
  48. if (!empty($words)) {
  49. foreach (array_keys($words) as $wlen) {
  50. $index = $this->getIndex('i', $wlen);
  51. foreach ($words[$wlen] as $wid => $freq) {
  52. $idx = ($wid < count($index)) ? $index[$wid] : '';
  53. $index[$wid] = $this->updateTuple($idx, $pid, $freq);
  54. $pagewords[] = "$wlen*$wid";
  55. }
  56. if (!$this->saveIndex('i', $wlen, $index)) {
  57. $this->unlock();
  58. return false;
  59. }
  60. }
  61. }
  62. // Remove obsolete index entries
  63. $pageword_idx = $this->getIndexKey('pageword', '', $pid);
  64. if ($pageword_idx !== '') {
  65. $oldwords = explode(':', $pageword_idx);
  66. $delwords = array_diff($oldwords, $pagewords);
  67. $upwords = [];
  68. foreach ($delwords as $word) {
  69. if ($word != '') {
  70. [$wlen, $wid] = explode('*', $word);
  71. $wid = (int)$wid;
  72. $upwords[$wlen][] = $wid;
  73. }
  74. }
  75. foreach ($upwords as $wlen => $widx) {
  76. $index = $this->getIndex('i', $wlen);
  77. foreach ($widx as $wid) {
  78. $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
  79. }
  80. $this->saveIndex('i', $wlen, $index);
  81. }
  82. }
  83. // Save the reverse index
  84. $pageword_idx = implode(':', $pagewords);
  85. if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
  86. $this->unlock();
  87. return false;
  88. }
  89. $this->unlock();
  90. return true;
  91. }
  92. /**
  93. * Split the words in a page and add them to the index.
  94. *
  95. * @param string $text content of the page
  96. * @return array list of word IDs and number of times used
  97. *
  98. * @author Andreas Gohr <andi@splitbrain.org>
  99. * @author Christopher Smith <chris@jalakai.co.uk>
  100. * @author Tom N Harris <tnharris@whoopdedo.org>
  101. */
  102. protected function getPageWords($text)
  103. {
  104. $tokens = $this->tokenizer($text);
  105. $tokens = array_count_values($tokens); // count the frequency of each token
  106. $words = [];
  107. foreach ($tokens as $w => $c) {
  108. $l = wordlen($w);
  109. if (isset($words[$l])) {
  110. $words[$l][$w] = $c + ($words[$l][$w] ?? 0);
  111. } else {
  112. $words[$l] = [$w => $c];
  113. }
  114. }
  115. // arrive here with $words = array(wordlen => array(word => frequency))
  116. $index = []; //resulting index
  117. foreach (array_keys($words) as $wlen) {
  118. $word_idx = $this->getIndex('w', $wlen);
  119. $word_idx_modified = false;
  120. foreach ($words[$wlen] as $word => $freq) {
  121. $word = (string)$word;
  122. $wid = array_search($word, $word_idx, true);
  123. if ($wid === false) {
  124. $wid = count($word_idx);
  125. $word_idx[] = $word;
  126. $word_idx_modified = true;
  127. }
  128. if (!isset($index[$wlen]))
  129. $index[$wlen] = [];
  130. $index[$wlen][$wid] = $freq;
  131. }
  132. // save back the word index
  133. if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx))
  134. return false;
  135. }
  136. return $index;
  137. }
  138. /**
  139. * Add/update keys to/of the metadata index.
  140. *
  141. * Adding new keys does not remove other keys for the page.
  142. * An empty value will erase the key.
  143. * The $key parameter can be an array to add multiple keys. $value will
  144. * not be used if $key is an array.
  145. *
  146. * @param string $page a page name
  147. * @param mixed $key a key string or array of key=>value pairs
  148. * @param mixed $value the value or list of values
  149. * @return boolean|string the function completed successfully
  150. *
  151. * @author Tom N Harris <tnharris@whoopdedo.org>
  152. * @author Michael Hamann <michael@content-space.de>
  153. */
  154. public function addMetaKeys($page, $key, $value = null)
  155. {
  156. if (!is_array($key)) {
  157. $key = [$key => $value];
  158. } elseif (!is_null($value)) {
  159. // $key is array, but $value is not null
  160. trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
  161. }
  162. if (!$this->lock())
  163. return "locked";
  164. // load known documents
  165. $pid = $this->getPIDNoLock($page);
  166. if ($pid === false) {
  167. $this->unlock();
  168. return false;
  169. }
  170. // Special handling for titles so the index file is simpler
  171. if (isset($key['title'])) {
  172. $value = $key['title'];
  173. if (is_array($value)) {
  174. $value = $value[0];
  175. }
  176. $this->saveIndexKey('title', '', $pid, $value);
  177. unset($key['title']);
  178. }
  179. foreach ($key as $name => $values) {
  180. $metaname = idx_cleanName($name);
  181. $this->addIndexKey('metadata', '', $metaname);
  182. $metaidx = $this->getIndex($metaname . '_i', '');
  183. $metawords = $this->getIndex($metaname . '_w', '');
  184. $addwords = false;
  185. if (!is_array($values)) $values = [$values];
  186. $val_idx = $this->getIndexKey($metaname . '_p', '', $pid);
  187. if ($val_idx !== '') {
  188. $val_idx = explode(':', $val_idx);
  189. // -1 means remove, 0 keep, 1 add
  190. $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
  191. } else {
  192. $val_idx = [];
  193. }
  194. foreach ($values as $val) {
  195. $val = (string)$val;
  196. if ($val !== "") {
  197. $id = array_search($val, $metawords, true);
  198. if ($id === false) {
  199. // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
  200. $id = count($metawords);
  201. $metawords[$id] = $val;
  202. $metaidx[$id] = '';
  203. $addwords = true;
  204. }
  205. // test if value is already in the index
  206. if (isset($val_idx[$id]) && $val_idx[$id] <= 0) {
  207. $val_idx[$id] = 0;
  208. } else { // else add it
  209. $val_idx[$id] = 1;
  210. }
  211. }
  212. }
  213. if ($addwords) {
  214. $this->saveIndex($metaname . '_w', '', $metawords);
  215. }
  216. $vals_changed = false;
  217. foreach ($val_idx as $id => $action) {
  218. if ($action == -1) {
  219. $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
  220. $vals_changed = true;
  221. unset($val_idx[$id]);
  222. } elseif ($action == 1) {
  223. $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
  224. $vals_changed = true;
  225. }
  226. }
  227. if ($vals_changed) {
  228. $this->saveIndex($metaname . '_i', '', $metaidx);
  229. $val_idx = implode(':', array_keys($val_idx));
  230. $this->saveIndexKey($metaname . '_p', '', $pid, $val_idx);
  231. }
  232. unset($metaidx);
  233. unset($metawords);
  234. }
  235. $this->unlock();
  236. return true;
  237. }
  238. /**
  239. * Rename a page in the search index without changing the indexed content. This function doesn't check if the
  240. * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the
  241. * indexer and it deletes all previously indexed content of the new page.
  242. *
  243. * @param string $oldpage The old page name
  244. * @param string $newpage The new page name
  245. * @return string|bool If the page was successfully renamed, can be a message in the case of an error
  246. */
  247. public function renamePage($oldpage, $newpage)
  248. {
  249. if (!$this->lock()) return 'locked';
  250. $pages = $this->getPages();
  251. $id = array_search($oldpage, $pages, true);
  252. if ($id === false) {
  253. $this->unlock();
  254. return 'page is not in index';
  255. }
  256. $new_id = array_search($newpage, $pages, true);
  257. if ($new_id !== false) {
  258. // make sure the page is not in the index anymore
  259. if (!$this->deletePageNoLock($newpage)) {
  260. return false;
  261. }
  262. $pages[$new_id] = 'deleted:' . time() . random_int(0, 9999);
  263. }
  264. $pages[$id] = $newpage;
  265. // update index
  266. if (!$this->saveIndex('page', '', $pages)) {
  267. $this->unlock();
  268. return false;
  269. }
  270. // reset the pid cache
  271. $this->pidCache = [];
  272. $this->unlock();
  273. return true;
  274. }
  275. /**
  276. * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages
  277. * will be updated.
  278. *
  279. * @param string $key The metadata key of which a value shall be changed
  280. * @param string $oldvalue The old value that shall be renamed
  281. * @param string $newvalue The new value to which the old value shall be renamed, if exists values will be merged
  282. * @return bool|string If renaming the value has been successful, false or error message on error.
  283. */
  284. public function renameMetaValue($key, $oldvalue, $newvalue)
  285. {
  286. if (!$this->lock()) return 'locked';
  287. // change the relation references index
  288. $metavalues = $this->getIndex($key, '_w');
  289. $oldid = array_search($oldvalue, $metavalues, true);
  290. if ($oldid !== false) {
  291. $newid = array_search($newvalue, $metavalues, true);
  292. if ($newid !== false) {
  293. // free memory
  294. unset($metavalues);
  295. // okay, now we have two entries for the same value. we need to merge them.
  296. $indexline = $this->getIndexKey($key . '_i', '', $oldid);
  297. if ($indexline != '') {
  298. $newindexline = $this->getIndexKey($key . '_i', '', $newid);
  299. $pagekeys = $this->getIndex($key . '_p', '');
  300. $parts = explode(':', $indexline);
  301. foreach ($parts as $part) {
  302. [$id, $count] = explode('*', $part);
  303. $newindexline = $this->updateTuple($newindexline, $id, $count);
  304. $keyline = explode(':', $pagekeys[$id]);
  305. // remove old meta value
  306. $keyline = array_diff($keyline, [$oldid]);
  307. // add new meta value when not already present
  308. if (!in_array($newid, $keyline)) {
  309. $keyline[] = $newid;
  310. }
  311. $pagekeys[$id] = implode(':', $keyline);
  312. }
  313. $this->saveIndex($key . '_p', '', $pagekeys);
  314. unset($pagekeys);
  315. $this->saveIndexKey($key . '_i', '', $oldid, '');
  316. $this->saveIndexKey($key . '_i', '', $newid, $newindexline);
  317. }
  318. } else {
  319. $metavalues[$oldid] = $newvalue;
  320. if (!$this->saveIndex($key . '_w', '', $metavalues)) {
  321. $this->unlock();
  322. return false;
  323. }
  324. }
  325. }
  326. $this->unlock();
  327. return true;
  328. }
  329. /**
  330. * Remove a page from the index
  331. *
  332. * Erases entries in all known indexes.
  333. *
  334. * @param string $page a page name
  335. * @return string|boolean the function completed successfully
  336. *
  337. * @author Tom N Harris <tnharris@whoopdedo.org>
  338. */
  339. public function deletePage($page)
  340. {
  341. if (!$this->lock())
  342. return "locked";
  343. $result = $this->deletePageNoLock($page);
  344. $this->unlock();
  345. return $result;
  346. }
  347. /**
  348. * Remove a page from the index without locking the index, only use this function if the index is already locked
  349. *
  350. * Erases entries in all known indexes.
  351. *
  352. * @param string $page a page name
  353. * @return boolean the function completed successfully
  354. *
  355. * @author Tom N Harris <tnharris@whoopdedo.org>
  356. */
  357. protected function deletePageNoLock($page)
  358. {
  359. // load known documents
  360. $pid = $this->getPIDNoLock($page);
  361. if ($pid === false) {
  362. return false;
  363. }
  364. // Remove obsolete index entries
  365. $pageword_idx = $this->getIndexKey('pageword', '', $pid);
  366. if ($pageword_idx !== '') {
  367. $delwords = explode(':', $pageword_idx);
  368. $upwords = [];
  369. foreach ($delwords as $word) {
  370. if ($word != '') {
  371. [$wlen, $wid] = explode('*', $word);
  372. $wid = (int)$wid;
  373. $upwords[$wlen][] = $wid;
  374. }
  375. }
  376. foreach ($upwords as $wlen => $widx) {
  377. $index = $this->getIndex('i', $wlen);
  378. foreach ($widx as $wid) {
  379. $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
  380. }
  381. $this->saveIndex('i', $wlen, $index);
  382. }
  383. }
  384. // Save the reverse index
  385. if (!$this->saveIndexKey('pageword', '', $pid, "")) {
  386. return false;
  387. }
  388. $this->saveIndexKey('title', '', $pid, "");
  389. $keyidx = $this->getIndex('metadata', '');
  390. foreach ($keyidx as $metaname) {
  391. $val_idx = explode(':', $this->getIndexKey($metaname . '_p', '', $pid));
  392. $meta_idx = $this->getIndex($metaname . '_i', '');
  393. foreach ($val_idx as $id) {
  394. if ($id === '') continue;
  395. $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
  396. }
  397. $this->saveIndex($metaname . '_i', '', $meta_idx);
  398. $this->saveIndexKey($metaname . '_p', '', $pid, '');
  399. }
  400. return true;
  401. }
  402. /**
  403. * Clear the whole index
  404. *
  405. * @return bool If the index has been cleared successfully
  406. */
  407. public function clear()
  408. {
  409. global $conf;
  410. if (!$this->lock()) return false;
  411. @unlink($conf['indexdir'] . '/page.idx');
  412. @unlink($conf['indexdir'] . '/title.idx');
  413. @unlink($conf['indexdir'] . '/pageword.idx');
  414. @unlink($conf['indexdir'] . '/metadata.idx');
  415. $dir = @opendir($conf['indexdir']);
  416. if ($dir !== false) {
  417. while (($f = readdir($dir)) !== false) {
  418. if (
  419. str_ends_with($f, '.idx') &&
  420. (str_starts_with($f, 'i') ||
  421. str_starts_with($f, 'w') ||
  422. str_ends_with($f, '_w.idx') ||
  423. str_ends_with($f, '_i.idx') ||
  424. str_ends_with($f, '_p.idx'))
  425. )
  426. @unlink($conf['indexdir'] . "/$f");
  427. }
  428. }
  429. @unlink($conf['indexdir'] . '/lengths.idx');
  430. // clear the pid cache
  431. $this->pidCache = [];
  432. $this->unlock();
  433. return true;
  434. }
  435. /**
  436. * Split the text into words for fulltext search
  437. *
  438. * TODO: does this also need &$stopwords ?
  439. *
  440. * @triggers INDEXER_TEXT_PREPARE
  441. * This event allows plugins to modify the text before it gets tokenized.
  442. * Plugins intercepting this event should also intercept INDEX_VERSION_GET
  443. *
  444. * @param string $text plain text
  445. * @param boolean $wc are wildcards allowed?
  446. * @return array list of words in the text
  447. *
  448. * @author Tom N Harris <tnharris@whoopdedo.org>
  449. * @author Andreas Gohr <andi@splitbrain.org>
  450. */
  451. public function tokenizer($text, $wc = false)
  452. {
  453. $wc = ($wc) ? '' : '\*';
  454. $stopwords =& idx_get_stopwords();
  455. // prepare the text to be tokenized
  456. $evt = new Event('INDEXER_TEXT_PREPARE', $text);
  457. if ($evt->advise_before(true)) {
  458. if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
  459. $text = Asian::separateAsianWords($text);
  460. }
  461. }
  462. $evt->advise_after();
  463. unset($evt);
  464. $text = strtr(
  465. $text,
  466. ["\r" => ' ', "\n" => ' ', "\t" => ' ', "\xC2\xAD" => '']
  467. );
  468. if (preg_match('/[^0-9A-Za-z ]/u', $text))
  469. $text = Clean::stripspecials($text, ' ', '\._\-:' . $wc);
  470. $wordlist = explode(' ', $text);
  471. foreach ($wordlist as $i => $word) {
  472. $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
  473. PhpString::strtolower($word) : strtolower($word);
  474. }
  475. foreach ($wordlist as $i => $word) {
  476. if (
  477. (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
  478. || in_array($word, $stopwords, true)
  479. )
  480. unset($wordlist[$i]);
  481. }
  482. return array_values($wordlist);
  483. }
  484. /**
  485. * Get the numeric PID of a page
  486. *
  487. * @param string $page The page to get the PID for
  488. * @return bool|int The page id on success, false on error
  489. */
  490. public function getPID($page)
  491. {
  492. // return PID without locking when it is in the cache
  493. if (isset($this->pidCache[$page])) return $this->pidCache[$page];
  494. if (!$this->lock())
  495. return false;
  496. // load known documents
  497. $pid = $this->getPIDNoLock($page);
  498. if ($pid === false) {
  499. $this->unlock();
  500. return false;
  501. }
  502. $this->unlock();
  503. return $pid;
  504. }
  505. /**
  506. * Get the numeric PID of a page without locking the index.
  507. * Only use this function when the index is already locked.
  508. *
  509. * @param string $page The page to get the PID for
  510. * @return bool|int The page id on success, false on error
  511. */
  512. protected function getPIDNoLock($page)
  513. {
  514. // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
  515. if (isset($this->pidCache[$page])) return $this->pidCache[$page];
  516. $pid = $this->addIndexKey('page', '', $page);
  517. // limit cache to 10 entries by discarding the oldest element as in DokuWiki usually only the most recently
  518. // added item will be requested again
  519. if (count($this->pidCache) > 10) array_shift($this->pidCache);
  520. $this->pidCache[$page] = $pid;
  521. return $pid;
  522. }
  523. /**
  524. * Get the page id of a numeric PID
  525. *
  526. * @param int $pid The PID to get the page id for
  527. * @return string The page id
  528. */
  529. public function getPageFromPID($pid)
  530. {
  531. return $this->getIndexKey('page', '', $pid);
  532. }
  533. /**
  534. * Find pages in the fulltext index containing the words,
  535. *
  536. * The search words must be pre-tokenized, meaning only letters and
  537. * numbers with an optional wildcard
  538. *
  539. * The returned array will have the original tokens as key. The values
  540. * in the returned list is an array with the page names as keys and the
  541. * number of times that token appears on the page as value.
  542. *
  543. * @param array $tokens list of words to search for
  544. * @return array list of page names with usage counts
  545. *
  546. * @author Tom N Harris <tnharris@whoopdedo.org>
  547. * @author Andreas Gohr <andi@splitbrain.org>
  548. */
  549. public function lookup(&$tokens)
  550. {
  551. $result = [];
  552. $wids = $this->getIndexWords($tokens, $result);
  553. if (empty($wids)) return [];
  554. // load known words and documents
  555. $page_idx = $this->getIndex('page', '');
  556. $docs = [];
  557. foreach (array_keys($wids) as $wlen) {
  558. $wids[$wlen] = array_unique($wids[$wlen]);
  559. $index = $this->getIndex('i', $wlen);
  560. foreach ($wids[$wlen] as $ixid) {
  561. if ($ixid < count($index))
  562. $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
  563. }
  564. }
  565. // merge found pages into final result array
  566. $final = [];
  567. foreach ($result as $word => $res) {
  568. $final[$word] = [];
  569. foreach ($res as $wid) {
  570. // handle the case when ($ixid < count($index)) has been false
  571. // and thus $docs[$wid] hasn't been set.
  572. if (!isset($docs[$wid])) continue;
  573. $hits = &$docs[$wid];
  574. foreach ($hits as $hitkey => $hitcnt) {
  575. // make sure the document still exists
  576. if (!page_exists($hitkey, '', false)) continue;
  577. if (!isset($final[$word][$hitkey]))
  578. $final[$word][$hitkey] = $hitcnt;
  579. else $final[$word][$hitkey] += $hitcnt;
  580. }
  581. }
  582. }
  583. return $final;
  584. }
  585. /**
  586. * Find pages containing a metadata key.
  587. *
  588. * The metadata values are compared as case-sensitive strings. Pass a
  589. * callback function that returns true or false to use a different
  590. * comparison function. The function will be called with the $value being
  591. * searched for as the first argument, and the word in the index as the
  592. * second argument. The function preg_match can be used directly if the
  593. * values are regexes.
  594. *
  595. * @param string $key name of the metadata key to look for
  596. * @param string $value search term to look for, must be a string or array of strings
  597. * @param callback $func comparison function
  598. * @return array lists with page names, keys are query values if $value is array
  599. *
  600. * @author Tom N Harris <tnharris@whoopdedo.org>
  601. * @author Michael Hamann <michael@content-space.de>
  602. */
  603. public function lookupKey($key, &$value, $func = null)
  604. {
  605. if (!is_array($value))
  606. $value_array = [$value];
  607. else $value_array =& $value;
  608. // the matching ids for the provided value(s)
  609. $value_ids = [];
  610. $metaname = idx_cleanName($key);
  611. // get all words in order to search the matching ids
  612. if ($key == 'title') {
  613. $words = $this->getIndex('title', '');
  614. } else {
  615. $words = $this->getIndex($metaname . '_w', '');
  616. }
  617. if (!is_null($func)) {
  618. foreach ($value_array as $val) {
  619. foreach ($words as $i => $word) {
  620. if (call_user_func_array($func, [$val, $word]))
  621. $value_ids[$i][] = $val;
  622. }
  623. }
  624. } else {
  625. foreach ($value_array as $val) {
  626. $xval = $val;
  627. $caret = '^';
  628. $dollar = '$';
  629. // check for wildcards
  630. if (str_starts_with($xval, '*')) {
  631. $xval = substr($xval, 1);
  632. $caret = '';
  633. }
  634. if (str_ends_with($xval, '*')) {
  635. $xval = substr($xval, 0, -1);
  636. $dollar = '';
  637. }
  638. if (!$caret || !$dollar) {
  639. $re = $caret . preg_quote($xval, '/') . $dollar;
  640. foreach (array_keys(preg_grep('/' . $re . '/', $words)) as $i)
  641. $value_ids[$i][] = $val;
  642. } elseif (($i = array_search($val, $words, true)) !== false) {
  643. $value_ids[$i][] = $val;
  644. }
  645. }
  646. }
  647. unset($words); // free the used memory
  648. // initialize the result so it won't be null
  649. $result = [];
  650. foreach ($value_array as $val) {
  651. $result[$val] = [];
  652. }
  653. $page_idx = $this->getIndex('page', '');
  654. // Special handling for titles
  655. if ($key == 'title') {
  656. foreach ($value_ids as $pid => $val_list) {
  657. $page = $page_idx[$pid];
  658. foreach ($val_list as $val) {
  659. $result[$val][] = $page;
  660. }
  661. }
  662. } else {
  663. // load all lines and pages so the used lines can be taken and matched with the pages
  664. $lines = $this->getIndex($metaname . '_i', '');
  665. foreach ($value_ids as $value_id => $val_list) {
  666. // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
  667. // is an array with page_id => 1, page2_id => 1 etc. so take the keys only
  668. $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
  669. foreach ($val_list as $val) {
  670. $result[$val] = [...$result[$val], ...$pages];
  671. }
  672. }
  673. }
  674. if (!is_array($value)) $result = $result[$value];
  675. return $result;
  676. }
  677. /**
  678. * Find the index ID of each search term.
  679. *
  680. * The query terms should only contain valid characters, with a '*' at
  681. * either the beginning or end of the word (or both).
  682. * The $result parameter can be used to merge the index locations with
  683. * the appropriate query term.
  684. *
  685. * @param array $words The query terms.
  686. * @param array $result Set to word => array("length*id" ...)
  687. * @return array Set to length => array(id ...)
  688. *
  689. * @author Tom N Harris <tnharris@whoopdedo.org>
  690. */
  691. protected function getIndexWords(&$words, &$result)
  692. {
  693. $tokens = [];
  694. $tokenlength = [];
  695. $tokenwild = [];
  696. foreach ($words as $word) {
  697. $result[$word] = [];
  698. $caret = '^';
  699. $dollar = '$';
  700. $xword = $word;
  701. $wlen = wordlen($word);
  702. // check for wildcards
  703. if (str_starts_with($xword, '*')) {
  704. $xword = substr($xword, 1);
  705. $caret = '';
  706. --$wlen;
  707. }
  708. if (str_ends_with($xword, '*')) {
  709. $xword = substr($xword, 0, -1);
  710. $dollar = '';
  711. --$wlen;
  712. }
  713. if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword))
  714. continue;
  715. if (!isset($tokens[$xword]))
  716. $tokenlength[$wlen][] = $xword;
  717. if (!$caret || !$dollar) {
  718. $re = $caret . preg_quote($xword, '/') . $dollar;
  719. $tokens[$xword][] = [$word, '/' . $re . '/'];
  720. if (!isset($tokenwild[$xword]))
  721. $tokenwild[$xword] = $wlen;
  722. } else {
  723. $tokens[$xword][] = [$word, null];
  724. }
  725. }
  726. asort($tokenwild);
  727. // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
  728. // $tokenlength = array( base word length => base word ... )
  729. // $tokenwild = array( base word => base word length ... )
  730. $length_filter = $tokenwild === [] ? $tokenlength : min(array_keys($tokenlength));
  731. $indexes_known = $this->indexLengths($length_filter);
  732. if ($tokenwild !== []) sort($indexes_known);
  733. // get word IDs
  734. $wids = [];
  735. foreach ($indexes_known as $ixlen) {
  736. $word_idx = $this->getIndex('w', $ixlen);
  737. // handle exact search
  738. if (isset($tokenlength[$ixlen])) {
  739. foreach ($tokenlength[$ixlen] as $xword) {
  740. $wid = array_search($xword, $word_idx, true);
  741. if ($wid !== false) {
  742. $wids[$ixlen][] = $wid;
  743. foreach ($tokens[$xword] as $w)
  744. $result[$w[0]][] = "$ixlen*$wid";
  745. }
  746. }
  747. }
  748. // handle wildcard search
  749. foreach ($tokenwild as $xword => $wlen) {
  750. if ($wlen >= $ixlen) break;
  751. foreach ($tokens[$xword] as $w) {
  752. if (is_null($w[1])) continue;
  753. foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
  754. $wids[$ixlen][] = $wid;
  755. $result[$w[0]][] = "$ixlen*$wid";
  756. }
  757. }
  758. }
  759. }
  760. return $wids;
  761. }
  762. /**
  763. * Return a list of all pages
  764. * Warning: pages may not exist!
  765. *
  766. * @param string $key list only pages containing the metadata key (optional)
  767. * @return array list of page names
  768. *
  769. * @author Tom N Harris <tnharris@whoopdedo.org>
  770. */
  771. public function getPages($key = null)
  772. {
  773. $page_idx = $this->getIndex('page', '');
  774. if (is_null($key)) return $page_idx;
  775. $metaname = idx_cleanName($key);
  776. // Special handling for titles
  777. if ($key == 'title') {
  778. $title_idx = $this->getIndex('title', '');
  779. array_splice($page_idx, count($title_idx));
  780. foreach ($title_idx as $i => $title)
  781. if ($title === "") unset($page_idx[$i]);
  782. return array_values($page_idx);
  783. }
  784. $pages = [];
  785. $lines = $this->getIndex($metaname . '_i', '');
  786. foreach ($lines as $line) {
  787. $pages = array_merge($pages, $this->parseTuples($page_idx, $line));
  788. }
  789. return array_keys($pages);
  790. }
  791. /**
  792. * Return a list of words sorted by number of times used
  793. *
  794. * @param int $min bottom frequency threshold
  795. * @param int $max upper frequency limit. No limit if $max<$min
  796. * @param int $minlen minimum length of words to count
  797. * @param string $key metadata key to list. Uses the fulltext index if not given
  798. * @return array list of words as the keys and frequency as values
  799. *
  800. * @author Tom N Harris <tnharris@whoopdedo.org>
  801. */
  802. public function histogram($min = 1, $max = 0, $minlen = 3, $key = null)
  803. {
  804. if ($min < 1)
  805. $min = 1;
  806. if ($max < $min)
  807. $max = 0;
  808. $result = [];
  809. if ($key == 'title') {
  810. $index = $this->getIndex('title', '');
  811. $index = array_count_values($index);
  812. foreach ($index as $val => $cnt) {
  813. if ($cnt >= $min && (!$max || $cnt <= $max) && strlen($val) >= $minlen)
  814. $result[$val] = $cnt;
  815. }
  816. } elseif (!is_null($key)) {
  817. $metaname = idx_cleanName($key);
  818. $index = $this->getIndex($metaname . '_i', '');
  819. $val_idx = [];
  820. foreach ($index as $wid => $line) {
  821. $freq = $this->countTuples($line);
  822. if ($freq >= $min && (!$max || $freq <= $max))
  823. $val_idx[$wid] = $freq;
  824. }
  825. if (!empty($val_idx)) {
  826. $words = $this->getIndex($metaname . '_w', '');
  827. foreach ($val_idx as $wid => $freq) {
  828. if (strlen($words[$wid]) >= $minlen)
  829. $result[$words[$wid]] = $freq;
  830. }
  831. }
  832. } else {
  833. $lengths = idx_listIndexLengths();
  834. foreach ($lengths as $length) {
  835. if ($length < $minlen) continue;
  836. $index = $this->getIndex('i', $length);
  837. $words = null;
  838. foreach ($index as $wid => $line) {
  839. $freq = $this->countTuples($line);
  840. if ($freq >= $min && (!$max || $freq <= $max)) {
  841. if ($words === null)
  842. $words = $this->getIndex('w', $length);
  843. $result[$words[$wid]] = $freq;
  844. }
  845. }
  846. }
  847. }
  848. arsort($result);
  849. return $result;
  850. }
  851. /**
  852. * Lock the indexer.
  853. *
  854. * @author Tom N Harris <tnharris@whoopdedo.org>
  855. *
  856. * @return bool|string
  857. */
  858. protected function lock()
  859. {
  860. global $conf;
  861. $status = true;
  862. $run = 0;
  863. $lock = $conf['lockdir'] . '/_indexer.lock';
  864. while (!@mkdir($lock)) {
  865. usleep(50);
  866. if (is_dir($lock) && time() - @filemtime($lock) > 60 * 5) {
  867. // looks like a stale lock - remove it
  868. if (!@rmdir($lock)) {
  869. $status = "removing the stale lock failed";
  870. return false;
  871. } else {
  872. $status = "stale lock removed";
  873. }
  874. } elseif ($run++ == 1000) {
  875. // we waited 5 seconds for that lock
  876. return false;
  877. }
  878. }
  879. if ($conf['dperm']) {
  880. chmod($lock, $conf['dperm']);
  881. }
  882. return $status;
  883. }
  884. /**
  885. * Release the indexer lock.
  886. *
  887. * @author Tom N Harris <tnharris@whoopdedo.org>
  888. *
  889. * @return bool
  890. */
  891. protected function unlock()
  892. {
  893. global $conf;
  894. @rmdir($conf['lockdir'] . '/_indexer.lock');
  895. return true;
  896. }
  897. /**
  898. * Retrieve the entire index.
  899. *
  900. * The $suffix argument is for an index that is split into
  901. * multiple parts. Different index files should use different
  902. * base names.
  903. *
  904. * @param string $idx name of the index
  905. * @param string $suffix subpart identifier
  906. * @return array list of lines without CR or LF
  907. *
  908. * @author Tom N Harris <tnharris@whoopdedo.org>
  909. */
  910. protected function getIndex($idx, $suffix)
  911. {
  912. global $conf;
  913. $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
  914. if (!file_exists($fn)) return [];
  915. return file($fn, FILE_IGNORE_NEW_LINES);
  916. }
  /**
   * Replace the contents of the index with an array.
   *
   * The lines are written to "<indexdir>/<idx><suffix>.tmp", each one
   * terminated by a LF, and the temporary file is then renamed to
   * "<indexdir>/<idx><suffix>.idx". When the data cannot be written
   * completely the temporary file is discarded and the existing index is
   * left untouched.
   *
   * @param string $idx name of the index
   * @param string $suffix subpart identifier
   * @param array $lines list of lines without LF
   * @return bool If saving succeeded
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  protected function saveIndex($idx, $suffix, &$lines)
  {
      global $conf;
      $fn = $conf['indexdir'] . '/' . $idx . $suffix;
      $tmp = $fn . '.tmp';
      $fh = @fopen($tmp, 'w');
      if (!$fh) return false;

      // every line, including the last one, is terminated by a LF
      $data = implode("\n", $lines);
      if (!empty($lines)) {
          $data .= "\n";
      }
      $written = fwrite($fh, $data);
      $closed = fclose($fh);
      if ($written !== strlen($data) || !$closed) {
          // an incomplete temporary file must never replace the existing index
          @unlink($tmp);
          return false;
      }

      if ($conf['fperm']) {
          chmod($tmp, $conf['fperm']);
      }
      // only an explicit false from io_rename() is treated as a failure
      return io_rename($tmp, $fn . '.idx') !== false;
  }
  /**
   * Retrieve a line from the index.
   *
   * Reads "<indexdir>/<idx><suffix>.idx" line by line, counting from zero,
   * and stops at line $id.
   *
   * @param string $idx name of the index
   * @param string $suffix subpart identifier
   * @param int $id the line number
   * @return string a line with trailing whitespace removed; an empty string
   *                when the file is missing, cannot be opened or has no
   *                line $id
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  protected function getIndexKey($idx, $suffix, $id)
  {
      global $conf;
      $path = "{$conf['indexdir']}/{$idx}{$suffix}.idx";
      if (!file_exists($path)) {
          return '';
      }
      $handle = @fopen($path, 'r');
      if (!$handle) {
          return '';
      }
      $found = '';
      for ($pos = 0; ($current = fgets($handle)) !== false; $pos++) {
          if ($pos == $id) {
              $found = $current;
              break;
          }
      }
      fclose($handle);
      return rtrim($found);
  }
  /**
   * Write a line into the index.
   *
   * The existing "<indexdir>/<idx><suffix>.idx" is copied to a ".tmp" file
   * with line $id (counted from zero) replaced by $line. When the index has
   * fewer lines than $id, or does not exist, the gap is filled with empty
   * lines before $line is written. The temporary file is then renamed over
   * the index.
   *
   * Nothing is replaced when the existing index cannot be read or the
   * temporary file cannot be written completely.
   *
   * @param string $idx name of the index
   * @param string $suffix subpart identifier
   * @param int $id the line number
   * @param string $line line to write
   * @return bool If saving succeeded
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  protected function saveIndexKey($idx, $suffix, $id, $line)
  {
      global $conf;
      if (!str_ends_with($line, "\n")) {
          $line .= "\n";
      }
      $fn = $conf['indexdir'] . '/' . $idx . $suffix;

      $ih = @fopen($fn . '.idx', 'r');
      if (!$ih && file_exists($fn . '.idx')) {
          // the index exists but cannot be read: rebuilding it from scratch
          // would silently drop every other line
          return false;
      }

      $fh = @fopen($fn . '.tmp', 'w');
      if (!$fh) {
          if ($ih) {
              fclose($ih);
          }
          return false;
      }

      // remember whether every chunk made it into the temporary file
      $ok = true;
      $put = static function ($data) use ($fh, &$ok) {
          if (fwrite($fh, $data) !== strlen($data)) {
              $ok = false;
          }
      };

      if ($ih) {
          $ln = -1;
          while (($curline = fgets($ih)) !== false) {
              if (++$ln == $id) {
                  $put($line);
              } else {
                  // an unterminated final line gets its LF so that nothing
                  // written after it is merged into it
                  $put(str_ends_with($curline, "\n") ? $curline : $curline . "\n");
              }
          }
          if ($id > $ln) {
              // pad with empty lines up to the requested position
              while ($id > ++$ln) {
                  $put("\n");
              }
              $put($line);
          }
          fclose($ih);
      } else {
          $ln = -1;
          while ($id > ++$ln) {
              $put("\n");
          }
          $put($line);
      }

      if (!fclose($fh)) {
          $ok = false;
      }
      if (!$ok) {
          // an incomplete temporary file must never replace the existing index
          @unlink($fn . '.tmp');
          return false;
      }

      if ($conf['fperm']) {
          chmod($fn . '.tmp', $conf['fperm']);
      }
      // only an explicit false from io_rename() is treated as a failure
      return io_rename($fn . '.tmp', $fn . '.idx') !== false;
  }
  /**
   * Retrieve or insert a value in the index.
   *
   * The whole index is loaded and searched for a line identical to $value.
   * When no such line exists the value is appended and the index is saved.
   *
   * NOTE(review): with PHP's default error handling E_USER_ERROR ends the
   * script, so the false return is presumably only reached when a custom
   * error handler is installed - confirm.
   *
   * @param string $idx name of the index
   * @param string $suffix subpart identifier
   * @param string $value line to find in the index
   * @return int|bool line number of the value in the index or false if writing the index failed
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   */
  protected function addIndexKey($idx, $suffix, $value)
  {
      $lines = $this->getIndex($idx, $suffix);
      $pos = array_search($value, $lines, true);
      if ($pos !== false) {
          // already known, nothing to write
          return $pos;
      }

      // append as a new last line
      $pos = count($lines);
      $lines[$pos] = $value;
      if ($this->saveIndex($idx, $suffix, $lines)) {
          return $pos;
      }

      trigger_error("Failed to write $idx index", E_USER_ERROR);
      return false;
  }
  /**
   * Get the list of lengths indexed in the wiki.
   *
   * Thin wrapper that hands back whatever the global helper
   * idx_listIndexLengths() returns. According to the original description
   * that helper reads the index directory or a cache file and yields a
   * sorted array of the word lengths used in the wiki; the helper itself is
   * defined outside this class.
   *
   * @author YoBoY <yoboy.leguesh@gmail.com>
   *
   * @return array
   */
  protected function listIndexLengths()
  {
      $lengths = idx_listIndexLengths();
      return $lengths;
  }
  /**
   * Get the word lengths that have been indexed.
   *
   * With an array $filter, its keys are taken as candidate lengths and
   * only those for which "<indexdir>/i<length>.idx" exists are returned.
   * With any other $filter, the lengths reported by idx_listIndexLengths()
   * are narrowed down to those greater than or equal to the filter value.
   *
   * @author YoBoY <yoboy.leguesh@gmail.com>
   *
   * @param array|int $filter lengths as array keys, or a minimum length
   * @return array list of matching lengths
   */
  protected function indexLengths($filter)
  {
      global $conf;

      if (is_array($filter)) {
          // testing if index files exist only
          $prefix = $conf['indexdir'] . "/i";
          return array_values(array_filter(
              array_keys($filter),
              static fn($len) => file_exists($prefix . $len . '.idx')
          ));
      }

      $kept = [];
      foreach (idx_listIndexLengths() as $len) {
          // keep all the values equal or superior
          if ((int)$len >= (int)$filter) {
              $kept[] = $len;
          }
      }
      return $kept;
  }
  /**
   * Insert or replace a tuple in a line.
   *
   * A line is a ":" separated list of "id*count" tuples. Any tuple already
   * stored for $id is removed first; when $count is truthy a fresh
   * "$id*$count" tuple is then put in front of the remaining ones.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   *
   * @param string $line
   * @param string|int $id
   * @param int $count
   * @return string the updated line
   */
  protected function updateTuple($line, $id, $count)
  {
      $rest = $line;
      if ($rest != '') {
          // drop the tuple currently stored for this id, if there is one
          $rest = preg_replace('/(^|:)' . preg_quote($id, '/') . '\*\d*/', '', $rest);
      }
      $rest = trim($rest, ':');

      if (!$count) {
          return $rest;
      }

      $tuple = "$id*$count";
      return $rest ? $tuple . ':' . $rest : $tuple;
  }
  /**
   * Split a line into an array of tuples.
   *
   * A line is a ":" separated list of "key*count" tuples. Empty tuples,
   * tuples without a count and tuples with a falsy count are skipped. When
   * a tuple's key is set in $keys it is replaced by the mapped value; a
   * mapped value of false drops the tuple.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   * @author Andreas Gohr <andi@splitbrain.org>
   *
   * @param array $keys map from keys used in the line to keys used in the result
   * @param string $line
   * @return array counts indexed by (mapped) key
   */
  protected function parseTuples(&$keys, $line)
  {
      $result = [];
      if ($line == '') return $result;
      foreach (explode(':', $line) as $tuple) {
          if ($tuple === '') continue;
          // a tuple without "*" has no count: pad so that it is skipped
          // without raising an "undefined array key" warning
          [$key, $cnt] = array_pad(explode('*', $tuple), 2, null);
          if (!$cnt) continue;
          if (isset($keys[$key])) {
              $key = $keys[$key];
              if ($key === false || is_null($key)) continue;
          }
          $result[$key] = $cnt;
      }
      return $result;
  }
  /**
   * Sum the counts in a list of tuples.
   *
   * A line is a ":" separated list of "id*count" tuples. Empty tuples are
   * skipped and a tuple without a count adds nothing to the sum.
   *
   * @author Tom N Harris <tnharris@whoopdedo.org>
   *
   * @param string $line
   * @return int sum of all counts in the line
   */
  protected function countTuples($line)
  {
      $freq = 0;
      foreach (explode(':', $line) as $tuple) {
          if ($tuple === '') continue;
          // index 1 holds the count; a tuple without "*" counts as zero
          // instead of raising an "undefined array key" warning
          $freq += (int)(explode('*', $tuple)[1] ?? 0);
      }
      return $freq;
  }
  1150. }