You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

1001 lines
31 KiB

  1. <?php
  2. /**
  3. * DokuWiki fulltextsearch functions using the index
  4. *
  5. * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
  6. * @author Andreas Gohr <andi@splitbrain.org>
  7. */
  8. use dokuwiki\Utf8\Asian;
  9. use dokuwiki\Search\Indexer;
  10. use dokuwiki\Extension\Event;
  11. use dokuwiki\Utf8\Clean;
  12. use dokuwiki\Utf8\PhpString;
  13. use dokuwiki\Utf8\Sort;
  14. /**
  15. * create snippets for the first few results only
  16. */
  17. if (!defined('FT_SNIPPET_NUMBER')) define('FT_SNIPPET_NUMBER', 15);
  18. /**
  19. * The fulltext search
  20. *
  21. * Returns a list of matching documents for the given query
  22. *
  23. * refactored into ft_pageSearch(), _ft_pageSearch() and trigger_event()
  24. *
  25. * @param string $query
  26. * @param array $highlight
  27. * @param string $sort
  28. * @param int|string $after only show results with mtime after this date, accepts timestap or strtotime arguments
  29. * @param int|string $before only show results with mtime before this date, accepts timestap or strtotime arguments
  30. *
  31. * @return array
  32. */
  33. function ft_pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
  34. {
  35. if ($sort === null) {
  36. $sort = 'hits';
  37. }
  38. $data = [
  39. 'query' => $query,
  40. 'sort' => $sort,
  41. 'after' => $after,
  42. 'before' => $before
  43. ];
  44. $data['highlight'] =& $highlight;
  45. return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
  46. }
  47. /**
  48. * Returns a list of matching documents for the given query
  49. *
  50. * @author Andreas Gohr <andi@splitbrain.org>
  51. * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  52. *
  53. * @param array $data event data
  54. * @return array matching documents
  55. */
  56. function _ft_pageSearch(&$data)
  57. {
  58. $Indexer = idx_get_indexer();
  59. // parse the given query
  60. $q = ft_queryParser($Indexer, $data['query']);
  61. $data['highlight'] = $q['highlight'];
  62. if (empty($q['parsed_ary'])) return [];
  63. // lookup all words found in the query
  64. $lookup = $Indexer->lookup($q['words']);
  65. // get all pages in this dokuwiki site (!: includes nonexistent pages)
  66. $pages_all = [];
  67. foreach ($Indexer->getPages() as $id) {
  68. $pages_all[$id] = 0; // base: 0 hit
  69. }
  70. // process the query
  71. $stack = [];
  72. foreach ($q['parsed_ary'] as $token) {
  73. switch (substr($token, 0, 3)) {
  74. case 'W+:':
  75. case 'W-:':
  76. case 'W_:': // word
  77. $word = substr($token, 3);
  78. if (isset($lookup[$word])) {
  79. $stack[] = (array)$lookup[$word];
  80. }
  81. break;
  82. case 'P+:':
  83. case 'P-:': // phrase
  84. $phrase = substr($token, 3);
  85. // since phrases are always parsed as ((W1)(W2)...(P)),
  86. // the end($stack) always points the pages that contain
  87. // all words in this phrase
  88. $pages = end($stack);
  89. $pages_matched = [];
  90. foreach (array_keys($pages) as $id) {
  91. $evdata = [
  92. 'id' => $id,
  93. 'phrase' => $phrase,
  94. 'text' => rawWiki($id)
  95. ];
  96. $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
  97. if ($evt->advise_before() && $evt->result !== true) {
  98. $text = PhpString::strtolower($evdata['text']);
  99. if (strpos($text, $phrase) !== false) {
  100. $evt->result = true;
  101. }
  102. }
  103. $evt->advise_after();
  104. if ($evt->result === true) {
  105. $pages_matched[$id] = 0; // phrase: always 0 hit
  106. }
  107. }
  108. $stack[] = $pages_matched;
  109. break;
  110. case 'N+:':
  111. case 'N-:': // namespace
  112. $ns = cleanID(substr($token, 3)) . ':';
  113. $pages_matched = [];
  114. foreach (array_keys($pages_all) as $id) {
  115. if (strpos($id, $ns) === 0) {
  116. $pages_matched[$id] = 0; // namespace: always 0 hit
  117. }
  118. }
  119. $stack[] = $pages_matched;
  120. break;
  121. case 'AND': // and operation
  122. $pages = array_splice($stack, -2);
  123. if ($pages === []) {
  124. break;
  125. }
  126. $stack[] = ft_resultCombine($pages);
  127. break;
  128. case 'OR': // or operation
  129. $pages = array_splice($stack, -2);
  130. if ($pages === []) {
  131. break;
  132. }
  133. $stack[] = ft_resultUnite($pages);
  134. break;
  135. case 'NOT': // not operation (unary)
  136. $pages = array_pop($stack);
  137. $stack[] = ft_resultComplement([$pages_all, $pages]);
  138. break;
  139. }
  140. }
  141. $docs = array_pop($stack);
  142. if (empty($docs)) return [];
  143. // check: settings, acls, existence
  144. foreach (array_keys($docs) as $id) {
  145. if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ || !page_exists($id, '', false)) {
  146. unset($docs[$id]);
  147. }
  148. }
  149. $docs = _ft_filterResultsByTime($docs, $data['after'], $data['before']);
  150. if ($data['sort'] === 'mtime') {
  151. uksort($docs, 'ft_pagemtimesorter');
  152. } else {
  153. // sort docs by count
  154. uksort($docs, 'ft_pagesorter');
  155. arsort($docs);
  156. }
  157. return $docs;
  158. }
  159. /**
  160. * Returns the backlinks for a given page
  161. *
  162. * Uses the metadata index.
  163. *
  164. * @param string $id The id for which links shall be returned
  165. * @param bool $ignore_perms Ignore the fact that pages are hidden or read-protected
  166. * @return array The pages that contain links to the given page
  167. */
  168. function ft_backlinks($id, $ignore_perms = false)
  169. {
  170. $result = idx_get_indexer()->lookupKey('relation_references', $id);
  171. if ($result === []) return $result;
  172. // check ACL permissions
  173. foreach (array_keys($result) as $idx) {
  174. if (
  175. (!$ignore_perms && (
  176. isHiddenPage($result[$idx]) || auth_quickaclcheck($result[$idx]) < AUTH_READ
  177. )) || !page_exists($result[$idx], '', false)
  178. ) {
  179. unset($result[$idx]);
  180. }
  181. }
  182. Sort::sort($result);
  183. return $result;
  184. }
  185. /**
  186. * Returns the pages that use a given media file
  187. *
  188. * Uses the relation media metadata property and the metadata index.
  189. *
  190. * Note that before 2013-07-31 the second parameter was the maximum number of results and
  191. * permissions were ignored. That's why the parameter is now checked to be explicitely set
  192. * to true (with type bool) in order to be compatible with older uses of the function.
  193. *
  194. * @param string $id The media id to look for
  195. * @param bool $ignore_perms Ignore hidden pages and acls (optional, default: false)
  196. * @return array A list of pages that use the given media file
  197. */
  198. function ft_mediause($id, $ignore_perms = false)
  199. {
  200. $result = idx_get_indexer()->lookupKey('relation_media', $id);
  201. if ($result === []) return $result;
  202. // check ACL permissions
  203. foreach (array_keys($result) as $idx) {
  204. if (
  205. (!$ignore_perms && (
  206. isHiddenPage($result[$idx]) || auth_quickaclcheck($result[$idx]) < AUTH_READ
  207. )) || !page_exists($result[$idx], '', false)
  208. ) {
  209. unset($result[$idx]);
  210. }
  211. }
  212. Sort::sort($result);
  213. return $result;
  214. }
  215. /**
  216. * Quicksearch for pagenames
  217. *
  218. * By default it only matches the pagename and ignores the
  219. * namespace. This can be changed with the second parameter.
  220. * The third parameter allows to search in titles as well.
  221. *
  222. * The function always returns titles as well
  223. *
  224. * @triggers SEARCH_QUERY_PAGELOOKUP
  225. * @author Andreas Gohr <andi@splitbrain.org>
  226. * @author Adrian Lang <lang@cosmocode.de>
  227. *
  228. * @param string $id page id
  229. * @param bool $in_ns match against namespace as well?
  230. * @param bool $in_title search in title?
  231. * @param int|string $after only show results with mtime after this date, accepts timestap or strtotime arguments
  232. * @param int|string $before only show results with mtime before this date, accepts timestap or strtotime arguments
  233. *
  234. * @return string[]
  235. */
  236. function ft_pageLookup($id, $in_ns = false, $in_title = false, $after = null, $before = null)
  237. {
  238. $data = [
  239. 'id' => $id,
  240. 'in_ns' => $in_ns,
  241. 'in_title' => $in_title,
  242. 'after' => $after,
  243. 'before' => $before
  244. ];
  245. $data['has_titles'] = true; // for plugin backward compatibility check
  246. return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, '_ft_pageLookup');
  247. }
  248. /**
  249. * Returns list of pages as array(pageid => First Heading)
  250. *
  251. * @param array &$data event data
  252. * @return string[]
  253. */
  254. function _ft_pageLookup(&$data)
  255. {
  256. // split out original parameters
  257. $id = $data['id'];
  258. $Indexer = idx_get_indexer();
  259. $parsedQuery = ft_queryParser($Indexer, $id);
  260. if (count($parsedQuery['ns']) > 0) {
  261. $ns = cleanID($parsedQuery['ns'][0]) . ':';
  262. $id = implode(' ', $parsedQuery['highlight']);
  263. }
  264. if (count($parsedQuery['notns']) > 0) {
  265. $notns = cleanID($parsedQuery['notns'][0]) . ':';
  266. $id = implode(' ', $parsedQuery['highlight']);
  267. }
  268. $in_ns = $data['in_ns'];
  269. $in_title = $data['in_title'];
  270. $cleaned = cleanID($id);
  271. $Indexer = idx_get_indexer();
  272. $page_idx = $Indexer->getPages();
  273. $pages = [];
  274. if ($id !== '' && $cleaned !== '') {
  275. foreach ($page_idx as $p_id) {
  276. if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
  277. if (!isset($pages[$p_id]))
  278. $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
  279. }
  280. }
  281. if ($in_title) {
  282. foreach ($Indexer->lookupKey('title', $id, '_ft_pageLookupTitleCompare') as $p_id) {
  283. if (!isset($pages[$p_id]))
  284. $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
  285. }
  286. }
  287. }
  288. if (isset($ns)) {
  289. foreach (array_keys($pages) as $p_id) {
  290. if (strpos($p_id, $ns) !== 0) {
  291. unset($pages[$p_id]);
  292. }
  293. }
  294. }
  295. if (isset($notns)) {
  296. foreach (array_keys($pages) as $p_id) {
  297. if (strpos($p_id, $notns) === 0) {
  298. unset($pages[$p_id]);
  299. }
  300. }
  301. }
  302. // discard hidden pages
  303. // discard nonexistent pages
  304. // check ACL permissions
  305. foreach (array_keys($pages) as $idx) {
  306. if (
  307. !isVisiblePage($idx) || !page_exists($idx) ||
  308. auth_quickaclcheck($idx) < AUTH_READ
  309. ) {
  310. unset($pages[$idx]);
  311. }
  312. }
  313. $pages = _ft_filterResultsByTime($pages, $data['after'], $data['before']);
  314. uksort($pages, 'ft_pagesorter');
  315. return $pages;
  316. }
  317. /**
  318. * @param array $results search results in the form pageid => value
  319. * @param int|string $after only returns results with mtime after this date, accepts timestap or strtotime arguments
  320. * @param int|string $before only returns results with mtime after this date, accepts timestap or strtotime arguments
  321. *
  322. * @return array
  323. */
  324. function _ft_filterResultsByTime(array $results, $after, $before)
  325. {
  326. if ($after || $before) {
  327. $after = is_int($after) ? $after : strtotime($after);
  328. $before = is_int($before) ? $before : strtotime($before);
  329. foreach (array_keys($results) as $id) {
  330. $mTime = filemtime(wikiFN($id));
  331. if ($after && $after > $mTime) {
  332. unset($results[$id]);
  333. continue;
  334. }
  335. if ($before && $before < $mTime) {
  336. unset($results[$id]);
  337. }
  338. }
  339. }
  340. return $results;
  341. }
  342. /**
  343. * Tiny helper function for comparing the searched title with the title
  344. * from the search index. This function is a wrapper around stripos with
  345. * adapted argument order and return value.
  346. *
  347. * @param string $search searched title
  348. * @param string $title title from index
  349. * @return bool
  350. */
  351. function _ft_pageLookupTitleCompare($search, $title)
  352. {
  353. if (Clean::isASCII($search)) {
  354. $pos = stripos($title, $search);
  355. } else {
  356. $pos = PhpString::strpos(
  357. PhpString::strtolower($title),
  358. PhpString::strtolower($search)
  359. );
  360. }
  361. return $pos !== false;
  362. }
  363. /**
  364. * Sort pages based on their namespace level first, then on their string
  365. * values. This makes higher hierarchy pages rank higher than lower hierarchy
  366. * pages.
  367. *
  368. * @param string $a
  369. * @param string $b
  370. * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b, and 0 if they are equal.
  371. */
  372. function ft_pagesorter($a, $b)
  373. {
  374. $ac = count(explode(':', $a));
  375. $bc = count(explode(':', $b));
  376. if ($ac < $bc) {
  377. return -1;
  378. } elseif ($ac > $bc) {
  379. return 1;
  380. }
  381. return Sort::strcmp($a, $b);
  382. }
  383. /**
  384. * Sort pages by their mtime, from newest to oldest
  385. *
  386. * @param string $a
  387. * @param string $b
  388. *
  389. * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a and 0 if they are of the same age
  390. */
  391. function ft_pagemtimesorter($a, $b)
  392. {
  393. $mtimeA = filemtime(wikiFN($a));
  394. $mtimeB = filemtime(wikiFN($b));
  395. return $mtimeB - $mtimeA;
  396. }
/**
 * Creates a snippet extract
 *
 * Loads the raw wiki text of the page, strips soft hyphens and triggers
 * FULLTEXT_SNIPPET_CREATE. The default action searches the text for the
 * highlight terms (at most four rounds), cuts out some context around each
 * match, joins the pieces with '... ', HTML-escapes the result and wraps
 * every matched term in <strong class="search_hit">.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string $id page id
 * @param array $highlight terms to highlight; empty entries are ignored
 * @return mixed the 'snippet' entry of the event data (an HTML string when the default action ran)
 */
function ft_snippet($id, $highlight)
{
    $text = rawWiki($id);
    // remove soft-hyphens
    $text = str_replace("\xC2\xAD", '', $text);
    // text and highlight are passed by reference, so event handlers may change them
    $evdata = [
        'id' => $id,
        'text' => &$text,
        'highlight' => &$highlight,
        'snippet' => ''
    ];
    $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
    if ($evt->advise_before()) {
        $match = [];
        $snippets = [];
        $utf8_offset = 0; // character offset right after the previous match
        $offset = 0;      // byte offset at which the next regexp search starts
        $end = 0;         // character offset where the previous context ended
        $len = PhpString::strlen($text);
        // build a regexp from the phrases to highlight:
        // $re1 is an alternation of all (quoted and boundary-wrapped) terms
        $re1 = '(' .
            implode(
                '|',
                array_map(
                    'ft_snippet_re_preprocess',
                    array_map(
                        'preg_quote_cb',
                        array_filter((array) $highlight)
                    )
                )
            ) .
            ')';
        // $re2: two terms at most 75 characters apart, the second not starting
        // with what the first one matched
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        // $re3: three terms, each at most 45 characters after the previous one
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";
        // at most four rounds; each round prefers the pattern with the most terms
        for ($cnt = 4; $cnt--;) {
            if (0) {
            } elseif (preg_match('/' . $re3 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
            } elseif (preg_match('/' . $re2 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
            } elseif (preg_match('/' . $re1 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
            } else {
                // nothing (more) to find
                break;
            }
            [$str, $idx] = $match[0];
            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = PhpString::strlen(substr($text, 0, $idx));
            $utf8_len = PhpString::strlen($str);
            // establish context, 100 bytes surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            // NOTE(review): $pre and $post are computed from character counts
            // (PhpString::strlen), not bytes - confirm the wording above
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);
            if ($pre > 50 && $post > 50) {
                $pre = 50;
                $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($offset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = [$text];
                break;
            }
            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context
            if ($append) {
                // overlapping contexts: extend the previous snippet instead of starting a new one
                $snippets[count($snippets) - 1] .= PhpString::substr($text, $append, $end - $append);
            } else {
                $snippets[] = PhpString::substr($text, $start, $end - $start);
            }
            // set $offset for next match attempt
            // continue matching after the current match
            // if the current match is not the longest possible match starting at the current offset
            // this prevents further matching of this snippet but for possible matches of length
            // smaller than match length + context (at least 50 characters) this match is part of the context
            $utf8_offset = $utf8_idx + $utf8_len;
            $offset = $idx + strlen(PhpString::substr($text, $utf8_idx, $utf8_len));
            $offset = Clean::correctIdx($text, $offset);
        }
        // surround every match with the control character 0x01 first, so that
        // the markers are still recognisable after hsc() escaped the text
        $m = "\1";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
        // then turn each marker pair into the highlighting HTML
        $snippet = preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(implode('... ', $snippets))
        );
        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);
    return $evdata['snippet'];
}
  504. /**
  505. * Wraps a search term in regex boundary checks.
  506. *
  507. * @param string $term
  508. * @return string
  509. */
  510. function ft_snippet_re_preprocess($term)
  511. {
  512. // do not process asian terms where word boundaries are not explicit
  513. if (Asian::isAsianWords($term)) return $term;
  514. if (UTF8_PROPERTYSUPPORT) {
  515. // unicode word boundaries
  516. // see http://stackoverflow.com/a/2449017/172068
  517. $BL = '(?<!\pL)';
  518. $BR = '(?!\pL)';
  519. } else {
  520. // not as correct as above, but at least won't break
  521. $BL = '\b';
  522. $BR = '\b';
  523. }
  524. if (str_starts_with($term, '\\*')) {
  525. $term = substr($term, 2);
  526. } else {
  527. $term = $BL . $term;
  528. }
  529. if (str_ends_with($term, '\\*')) {
  530. $term = substr($term, 0, -2);
  531. } else {
  532. $term .= $BR;
  533. }
  534. if ($term == $BL || $term == $BR || $term == $BL . $BR) $term = '';
  535. return $term;
  536. }
  537. /**
  538. * Combine found documents and sum up their scores
  539. *
  540. * This function is used to combine searched words with a logical
  541. * AND. Only documents available in all arrays are returned.
  542. *
  543. * based upon PEAR's PHP_Compat function for array_intersect_key()
  544. *
  545. * @param array $args An array of page arrays
  546. * @return array
  547. */
  548. function ft_resultCombine($args)
  549. {
  550. $array_count = count($args);
  551. if ($array_count == 1) {
  552. return $args[0];
  553. }
  554. $result = [];
  555. if ($array_count > 1) {
  556. foreach ($args[0] as $key => $value) {
  557. $result[$key] = $value;
  558. for ($i = 1; $i !== $array_count; $i++) {
  559. if (!isset($args[$i][$key])) {
  560. unset($result[$key]);
  561. break;
  562. }
  563. $result[$key] += $args[$i][$key];
  564. }
  565. }
  566. }
  567. return $result;
  568. }
  569. /**
  570. * Unites found documents and sum up their scores
  571. *
  572. * based upon ft_resultCombine() function
  573. *
  574. * @param array $args An array of page arrays
  575. * @return array
  576. *
  577. * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  578. */
  579. function ft_resultUnite($args)
  580. {
  581. $array_count = count($args);
  582. if ($array_count === 1) {
  583. return $args[0];
  584. }
  585. $result = $args[0];
  586. for ($i = 1; $i !== $array_count; $i++) {
  587. foreach (array_keys($args[$i]) as $id) {
  588. $result[$id] += $args[$i][$id];
  589. }
  590. }
  591. return $result;
  592. }
  593. /**
  594. * Computes the difference of documents using page id for comparison
  595. *
  596. * nearly identical to PHP5's array_diff_key()
  597. *
  598. * @param array $args An array of page arrays
  599. * @return array
  600. *
  601. * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  602. */
  603. function ft_resultComplement($args)
  604. {
  605. $array_count = count($args);
  606. if ($array_count === 1) {
  607. return $args[0];
  608. }
  609. $result = $args[0];
  610. foreach (array_keys($result) as $id) {
  611. for ($i = 1; $i !== $array_count; $i++) {
  612. if (isset($args[$i][$id])) unset($result[$id]);
  613. }
  614. }
  615. return $result;
  616. }
/**
 * Parses a search query and builds an array of search formulas
 *
 * The query is lowercased, translated into an intermediate infix string and
 * then converted into a postfix (Reverse Polish notation) token list.
 *
 * The returned array has the following keys:
 *   query      - the query as it was passed in
 *   parsed_str - the intermediate infix representation
 *   parsed_ary - the postfix token list
 *   words      - all word tokens (W+:, W-:, W_:)
 *   phrases    - all phrase tokens (P+:, P-:)
 *   highlight  - words and phrases that are not logically negated (W+:, P+:)
 *   ns, notns, and, not - kept for backward compatibility
 * The seven list entries are always arrays with unique values.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param Indexer $Indexer
 * @param string $query search query
 * @return array of search formulas
 */
function ft_queryParser($Indexer, $query)
{
    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     * words:
     * include
     * -exclude
     * phrases:
     * "phrase to be included"
     * -"phrase you want to exclude"
     * namespaces:
     * @include:namespace (or ns:include:namespace)
     * ^exclude:namespace (or -ns:exclude:namespace)
     * groups:
     * ()
     * -()
     * operators:
     * and ('and' is the default operator: you can always omit this)
     * or (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     * a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     * this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     * as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     * ( ) - group
     * AND - logical and
     * OR - logical or
     * NOT - logical not
     * W+:, W-:, W_: - word (underscore: no need to highlight)
     * P+:, P-: - phrase (minus sign: logically in NOT group)
     * N+:, N-: - namespace
     */
    $parsed_query = '';
    $parens_level = 0;
    // split the lowercased query into quoted phrases (optionally prefixed
    // with '-') and the text in between
    $terms = preg_split(
        '/(-?".*?")/u',
        PhpString::strtolower($query),
        -1,
        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
    );
    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not . ft_termParser($Indexer, $matches[2], false, true);
        } else {
            // fix incomplete phrase
            $term = str_replace('"', ' ', $term);
            // fix parentheses
            $term = str_replace(')', ' ) ', $term);
            $term = str_replace('(', ' ( ', $term);
            $term = str_replace('- (', ' -(', $term);
            // treat pipe symbols as 'OR' operators
            $term = str_replace('|', ' or ', $term);
            // treat ideographic spaces (U+3000) as search term separators
            // FIXME: some more separators?
            $term = preg_replace('/[ \x{3000}]+/u', ' ', $term);
            $term = trim($term);
            if ($term === '') continue;
            $tokens = explode(' ', $term);
            foreach ($tokens as $token) {
                if ($token === '(') {
                    // parenthesis-include-open
                    $parsed .= '(';
                    ++$parens_level;
                } elseif ($token === '-(') {
                    // parenthesis-exclude-open
                    $parsed .= 'NOT(';
                    ++$parens_level;
                } elseif ($token === ')') {
                    // parenthesis-any-close
                    // a closing parenthesis without an open group is dropped
                    if ($parens_level === 0) continue;
                    $parsed .= ')';
                    $parens_level--;
                } elseif ($token === 'and') {
                    // logical-and (do nothing)
                } elseif ($token === 'or') {
                    // logical-or
                    $parsed .= 'OR';
                } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                    // namespace-exclude
                    $parsed .= 'NOT(N+:' . $matches[1] . ')';
                } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                    // namespace-include
                    $parsed .= '(N+:' . $matches[1] . ')';
                } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                    // word-exclude
                    // NOTE(review): unlike the neighbouring patterns this one has no /u modifier
                    $parsed .= 'NOT(' . ft_termParser($Indexer, $matches[1]) . ')';
                } else {
                    // word-include
                    $parsed .= ft_termParser($Indexer, $token);
                }
            }
        }
        $parsed_query .= $parsed;
    }
    // cleanup (very sensitive)
    // close all groups that are still open
    $parsed_query .= str_repeat(')', $parens_level);
    // repeatedly remove empty groups, negated or not, until nothing changes
    do {
        $parsed_query_old = $parsed_query;
        $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
    } while ($parsed_query !== $parsed_query_old);
    // drop operators without a right hand side, collapse repeated ORs, drop
    // ORs without a left hand side and ORs at the very beginning or end
    $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')', $parsed_query);
    $parsed_query = preg_replace('/(OR)+/u', 'OR', $parsed_query);
    $parsed_query = preg_replace('/\(OR/u', '(', $parsed_query);
    $parsed_query = preg_replace('/^OR|OR$/u', '', $parsed_query);
    // make the implicit AND between adjacent groups explicit
    $parsed_query = preg_replace('/\)(NOT)?\(/u', ')AND$1(', $parsed_query);
    // adjustment: make highlightings right
    $parens_level = 0;
    $notgrp_levels = []; // nesting levels at which a NOT( group was opened
    $parsed_query_new = '';
    $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if ($token === 'NOT(') {
            $notgrp_levels[] = ++$parens_level;
        } elseif ($token === '(') {
            ++$parens_level;
        } elseif ($token === ')') {
            // leaving the innermost NOT group?
            if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
        } elseif (count($notgrp_levels) % 2 === 1) {
            // turn highlight-flag off if terms are logically in "NOT" group
            // (an odd number of enclosing NOT groups means the term is negated)
            $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
        }
        $parsed_query_new .= $token;
    }
    $parsed_query = $parsed_query_new;
    /**
     * convert infix notation string into postfix (Reverse Polish notation) array
     * by Shunting-yard algorithm
     *
     * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
     * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
     */
    $parsed_ary = [];
    $ope_stack = [];
    $ope_precedence = [')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5];
    $ope_regex = '/([()]|OR|AND|NOT)/u';
    $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    foreach ($tokens as $token) {
        if (preg_match($ope_regex, $token)) {
            // operator
            // move stacked operators of same or higher precedence to the
            // output first, but never past an opening parenthesis
            $last_ope = end($ope_stack);
            while ($last_ope !== false && $ope_precedence[$token] <= $ope_precedence[$last_ope] && $last_ope != '(') {
                $parsed_ary[] = array_pop($ope_stack);
                $last_ope = end($ope_stack);
            }
            if ($token == ')') {
                array_pop($ope_stack); // this array_pop always deletes '('
            } else {
                $ope_stack[] = $token;
            }
        } else {
            // operand
            // restore the parentheses that ft_termParser() encoded as OP and CP
            $token_decoded = str_replace(['OP', 'CP'], ['(', ')'], $token);
            $parsed_ary[] = $token_decoded;
        }
    }
    // flush the operators that are still on the stack
    $parsed_ary = array_values([...$parsed_ary, ...array_reverse($ope_stack)]);
    // cleanup: each double "NOT" in RPN array actually does nothing
    // NOTE(review): after a pair was removed, the next round reads the index
    // that was just unset if a third NOT follows - confirm this cannot occur
    $parsed_ary_count = count($parsed_ary);
    for ($i = 1; $i < $parsed_ary_count; ++$i) {
        if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
            unset($parsed_ary[$i], $parsed_ary[$i - 1]);
        }
    }
    $parsed_ary = array_values($parsed_ary);
    // build return value
    $q = [];
    $q['query'] = $query;
    $q['parsed_str'] = $parsed_query;
    $q['parsed_ary'] = $parsed_ary;
    foreach ($q['parsed_ary'] as $token) {
        // operators carry no 'X?:' prefix and are skipped here
        if (strlen($token) < 3 || $token[2] !== ':') continue;
        $body = substr($token, 3);
        switch (substr($token, 0, 3)) {
            case 'N+:':
                $q['ns'][] = $body; // for backward compatibility
                break;
            case 'N-:':
                $q['notns'][] = $body; // for backward compatibility
                break;
            case 'W_:':
                $q['words'][] = $body;
                break;
            case 'W-:':
                $q['words'][] = $body;
                $q['not'][] = $body; // for backward compatibility
                break;
            case 'W+:':
                $q['words'][] = $body;
                $q['highlight'][] = $body;
                $q['and'][] = $body; // for backward compatibility
                break;
            case 'P-:':
                $q['phrases'][] = $body;
                break;
            case 'P+:':
                $q['phrases'][] = $body;
                $q['highlight'][] = $body;
                break;
        }
    }
    // make sure every list exists, holds unique values and is reindexed
    foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) {
        $q[$key] = empty($q[$key]) ? [] : array_values(array_unique($q[$key]));
    }
    return $q;
}
  841. /**
  842. * Transforms given search term into intermediate representation
  843. *
  844. * This function is used in ft_queryParser() and not for general purpose use.
  845. *
  846. * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  847. *
  848. * @param Indexer $Indexer
  849. * @param string $term
  850. * @param bool $consider_asian
  851. * @param bool $phrase_mode
  852. * @return string
  853. */
  854. function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false)
  855. {
  856. $parsed = '';
  857. if ($consider_asian) {
  858. // successive asian characters need to be searched as a phrase
  859. $words = Asian::splitAsianWords($term);
  860. foreach ($words as $word) {
  861. $phrase_mode = $phrase_mode ? true : Asian::isAsianWords($word);
  862. $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode);
  863. }
  864. } else {
  865. $term_noparen = str_replace(['(', ')'], ' ', $term);
  866. $words = $Indexer->tokenizer($term_noparen, true);
  867. // W_: no need to highlight
  868. if (empty($words)) {
  869. $parsed = '()'; // important: do not remove
  870. } elseif ($words[0] === $term) {
  871. $parsed = '(W+:' . $words[0] . ')';
  872. } elseif ($phrase_mode) {
  873. $term_encoded = str_replace(['(', ')'], ['OP', 'CP'], $term);
  874. $parsed = '((W_:' . implode(')(W_:', $words) . ')(P+:' . $term_encoded . '))';
  875. } else {
  876. $parsed = '((W+:' . implode(')(W+:', $words) . '))';
  877. }
  878. }
  879. return $parsed;
  880. }
  881. /**
  882. * Recreate a search query string based on parsed parts, doesn't support negated phrases and `OR` searches
  883. *
  884. * @param array $and
  885. * @param array $not
  886. * @param array $phrases
  887. * @param array $ns
  888. * @param array $notns
  889. *
  890. * @return string
  891. */
  892. function ft_queryUnparser_simple(array $and, array $not, array $phrases, array $ns, array $notns)
  893. {
  894. $query = implode(' ', $and);
  895. if ($not !== []) {
  896. $query .= ' -' . implode(' -', $not);
  897. }
  898. if ($phrases !== []) {
  899. $query .= ' "' . implode('" "', $phrases) . '"';
  900. }
  901. if ($ns !== []) {
  902. $query .= ' @' . implode(' @', $ns);
  903. }
  904. if ($notns !== []) {
  905. $query .= ' ^' . implode(' ^', $notns);
  906. }
  907. return $query;
  908. }
  909. //Setup VIM: ex: et ts=4 :