You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

589 lines
18 KiB

  1. <?php
  2. /**
  3. * DokuWiki search functions
  4. *
  5. * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
  6. * @author Andreas Gohr <andi@splitbrain.org>
  7. */
  8. use dokuwiki\Utf8\PhpString;
  9. use dokuwiki\File\MediaFile;
  10. use dokuwiki\Utf8\Sort;
  11. /**
  12. * Recurse directory
  13. *
  14. * This function recurses into a given base directory
  15. * and calls the supplied function for each file and directory
  16. *
  17. * @param array &$data The results of the search are stored here
  18. * @param string $base Where to start the search
  19. * @param callback $func Callback (function name or array with object,method)
  20. * @param array $opts option array will be given to the Callback
  21. * @param string $dir Current directory beyond $base
  22. * @param int $lvl Recursion Level
  23. * @param mixed $sort 'natural' to use natural order sorting (default);
  24. * 'date' to sort by filemtime; leave empty to skip sorting.
  25. * @author Andreas Gohr <andi@splitbrain.org>
  26. */
  27. function search(&$data, $base, $func, $opts, $dir = '', $lvl = 1, $sort = 'natural')
  28. {
  29. $dirs = [];
  30. $files = [];
  31. $filepaths = [];
  32. // safeguard against runaways #1452
  33. if ($base == '' || $base == '/') {
  34. throw new RuntimeException('No valid $base passed to search() - possible misconfiguration or bug');
  35. }
  36. //read in directories and files
  37. $dh = @opendir($base . '/' . $dir);
  38. if (!$dh) return;
  39. while (($file = readdir($dh)) !== false) {
  40. if (preg_match('/^[\._]/', $file)) continue; //skip hidden files and upper dirs
  41. if (is_dir($base . '/' . $dir . '/' . $file)) {
  42. $dirs[] = $dir . '/' . $file;
  43. continue;
  44. }
  45. $files[] = $dir . '/' . $file;
  46. $filepaths[] = $base . '/' . $dir . '/' . $file;
  47. }
  48. closedir($dh);
  49. if (!empty($sort)) {
  50. if ($sort == 'date') {
  51. @array_multisort(array_map('filemtime', $filepaths), SORT_NUMERIC, SORT_DESC, $files);
  52. } else /* natural */ {
  53. Sort::asortFN($files);
  54. }
  55. Sort::asortFN($dirs);
  56. }
  57. //give directories to userfunction then recurse
  58. foreach ($dirs as $dir) {
  59. if (call_user_func_array($func, [&$data, $base, $dir, 'd', $lvl, $opts])) {
  60. search($data, $base, $func, $opts, $dir, $lvl + 1, $sort);
  61. }
  62. }
  63. //now handle the files
  64. foreach ($files as $file) {
  65. call_user_func_array($func, [&$data, $base, $file, 'f', $lvl, $opts]);
  66. }
  67. }
  68. /**
  69. * The following functions are userfunctions to use with the search
  70. * function above. This function is called for every found file or
  71. * directory. When a directory is given to the function it has to
  72. * decide if this directory should be traversed (true) or not (false)
  73. * The function has to accept the following parameters:
  74. *
  75. * array &$data - Reference to the result data structure
  76. * string $base - Base usually $conf['datadir']
  77. * string $file - current file or directory relative to $base
  78. * string $type - Type either 'd' for directory or 'f' for file
  79. * int $lvl - Current recursion depth
  80. * array $opts - option array as given to search()
  81. *
  82. * return values for files are ignored
  83. *
  84. * All functions should check the ACL for document READ rights
  85. * namespaces (directories) are NOT checked (when sneaky_index is 0) as this
  86. * would break the recursion (You can have a nonreadable dir over a readable
  87. * one deeper nested) also make sure to check the file type (for example
  88. * in case of lockfiles).
  89. */
  90. /**
  91. * Searches for pages beginning with the given query
  92. *
  93. * @author Andreas Gohr <andi@splitbrain.org>
  94. *
  95. * @param array $data
  96. * @param string $base
  97. * @param string $file
  98. * @param string $type
  99. * @param integer $lvl
  100. * @param array $opts
  101. *
  102. * @return bool
  103. */
  104. function search_qsearch(&$data, $base, $file, $type, $lvl, $opts)
  105. {
  106. $opts = [
  107. 'idmatch' => '(^|:)' . preg_quote($opts['query'], '/') . '/',
  108. 'listfiles' => true,
  109. 'pagesonly' => true
  110. ];
  111. return search_universal($data, $base, $file, $type, $lvl, $opts);
  112. }
  113. /**
  114. * Build the browsable index of pages
  115. *
  116. * $opts['ns'] is the currently viewed namespace
  117. *
  118. * @author Andreas Gohr <andi@splitbrain.org>
  119. *
  120. * @param array $data
  121. * @param string $base
  122. * @param string $file
  123. * @param string $type
  124. * @param integer $lvl
  125. * @param array $opts
  126. *
  127. * @return bool
  128. */
  129. function search_index(&$data, $base, $file, $type, $lvl, $opts)
  130. {
  131. global $conf;
  132. $ns = $opts['ns'] ?? '';
  133. $opts = [
  134. 'pagesonly' => true,
  135. 'listdirs' => true,
  136. 'listfiles' => empty($opts['nofiles']),
  137. 'sneakyacl' => $conf['sneaky_index'],
  138. // Hacky, should rather use recmatch
  139. 'depth' => preg_match('#^' . preg_quote($file, '#') . '(/|$)#', '/' . $ns) ? 0 : -1,
  140. ];
  141. return search_universal($data, $base, $file, $type, $lvl, $opts);
  142. }
  143. /**
  144. * List all namespaces
  145. *
  146. * @author Andreas Gohr <andi@splitbrain.org>
  147. *
  148. * @param array $data
  149. * @param string $base
  150. * @param string $file
  151. * @param string $type
  152. * @param integer $lvl
  153. * @param array $opts
  154. *
  155. * @return bool
  156. */
  157. function search_namespaces(&$data, $base, $file, $type, $lvl, $opts)
  158. {
  159. $opts = ['listdirs' => true];
  160. return search_universal($data, $base, $file, $type, $lvl, $opts);
  161. }
  162. /**
  163. * List all mediafiles in a namespace
  164. * $opts['depth'] recursion level, 0 for all
  165. * $opts['showmsg'] shows message if invalid media id is used
  166. * $opts['skipacl'] skip acl checking
  167. * $opts['pattern'] check given pattern
  168. * $opts['hash'] add hashes to result list
  169. *
  170. * @author Andreas Gohr <andi@splitbrain.org>
  171. *
  172. * @param array $data
  173. * @param string $base
  174. * @param string $file
  175. * @param string $type
  176. * @param integer $lvl
  177. * @param array $opts
  178. *
  179. * @return bool
  180. */
  181. function search_media(&$data, $base, $file, $type, $lvl, $opts)
  182. {
  183. //we do nothing with directories
  184. if ($type == 'd') {
  185. if (empty($opts['depth'])) return true; // recurse forever
  186. $depth = substr_count($file, '/');
  187. if ($depth >= $opts['depth']) return false; // depth reached
  188. return true;
  189. }
  190. $info = [];
  191. $info['id'] = pathID($file, true);
  192. if ($info['id'] !== cleanID($info['id'])) {
  193. if (!empty($opts['showmsg']))
  194. msg(hsc($info['id']) . ' is not a valid file name for DokuWiki - skipped', -1);
  195. return false; // skip non-valid files
  196. }
  197. //check ACL for namespace (we have no ACL for mediafiles)
  198. $info['perm'] = auth_quickaclcheck(getNS($info['id']) . ':*');
  199. if (empty($opts['skipacl']) && $info['perm'] < AUTH_READ) {
  200. return false;
  201. }
  202. //check pattern filter
  203. if (!empty($opts['pattern']) && !@preg_match($opts['pattern'], $info['id'])) {
  204. return false;
  205. }
  206. $info['file'] = PhpString::basename($file);
  207. $info['size'] = filesize($base . '/' . $file);
  208. $info['mtime'] = filemtime($base . '/' . $file);
  209. $info['writable'] = is_writable($base . '/' . $file);
  210. if (preg_match("/\.(jpe?g|gif|png)$/", $file)) {
  211. $info['isimg'] = true;
  212. $info['meta'] = new JpegMeta($base . '/' . $file);
  213. } else {
  214. $info['isimg'] = false;
  215. }
  216. if (!empty($opts['hash'])) {
  217. $info['hash'] = md5(io_readFile(mediaFN($info['id']), false));
  218. }
  219. $data[] = $info;
  220. return false;
  221. }
  222. /**
  223. * List all mediafiles in a namespace
  224. * $opts['depth'] recursion level, 0 for all
  225. * $opts['showmsg'] shows message if invalid media id is used
  226. * $opts['skipacl'] skip acl checking
  227. * $opts['pattern'] check given pattern
  228. * $opts['hash'] add hashes to result list
  229. *
  230. * @todo This is a temporary copy of search_media returning a list of MediaFile intances
  231. *
  232. * @param array $data
  233. * @param string $base
  234. * @param string $file
  235. * @param string $type
  236. * @param integer $lvl
  237. * @param array $opts
  238. *
  239. * @return bool
  240. */
  241. function search_mediafiles(&$data, $base, $file, $type, $lvl, $opts)
  242. {
  243. //we do nothing with directories
  244. if ($type == 'd') {
  245. if (empty($opts['depth'])) return true; // recurse forever
  246. $depth = substr_count($file, '/');
  247. if ($depth >= $opts['depth']) return false; // depth reached
  248. return true;
  249. }
  250. $id = pathID($file, true);
  251. if ($id != cleanID($id)) {
  252. if ($opts['showmsg'])
  253. msg(hsc($id) . ' is not a valid file name for DokuWiki - skipped', -1);
  254. return false; // skip non-valid files
  255. }
  256. //check ACL for namespace (we have no ACL for mediafiles)
  257. $info['perm'] = auth_quickaclcheck(getNS($id) . ':*');
  258. if (empty($opts['skipacl']) && $info['perm'] < AUTH_READ) {
  259. return false;
  260. }
  261. //check pattern filter
  262. if (!empty($opts['pattern']) && !@preg_match($opts['pattern'], $id)) {
  263. return false;
  264. }
  265. $data[] = new MediaFile($id);
  266. return false;
  267. }
  268. /**
  269. * This function just lists documents (for RSS namespace export)
  270. *
  271. * @author Andreas Gohr <andi@splitbrain.org>
  272. *
  273. * @param array $data
  274. * @param string $base
  275. * @param string $file
  276. * @param string $type
  277. * @param integer $lvl
  278. * @param array $opts
  279. *
  280. * @return bool
  281. */
  282. function search_list(&$data, $base, $file, $type, $lvl, $opts)
  283. {
  284. //we do nothing with directories
  285. if ($type == 'd') return false;
  286. //only search txt files
  287. if (str_ends_with($file, '.txt')) {
  288. //check ACL
  289. $id = pathID($file);
  290. if (auth_quickaclcheck($id) < AUTH_READ) {
  291. return false;
  292. }
  293. $data[]['id'] = $id;
  294. }
  295. return false;
  296. }
  297. /**
  298. * Quicksearch for searching matching pagenames
  299. *
  300. * $opts['query'] is the search query
  301. *
  302. * @author Andreas Gohr <andi@splitbrain.org>
  303. *
  304. * @param array $data
  305. * @param string $base
  306. * @param string $file
  307. * @param string $type
  308. * @param integer $lvl
  309. * @param array $opts
  310. *
  311. * @return bool
  312. */
  313. function search_pagename(&$data, $base, $file, $type, $lvl, $opts)
  314. {
  315. //we do nothing with directories
  316. if ($type == 'd') return true;
  317. //only search txt files
  318. if (!str_ends_with($file, '.txt')) return true;
  319. //simple stringmatching
  320. if (!empty($opts['query'])) {
  321. if (strpos($file, (string) $opts['query']) !== false) {
  322. //check ACL
  323. $id = pathID($file);
  324. if (auth_quickaclcheck($id) < AUTH_READ) {
  325. return false;
  326. }
  327. $data[]['id'] = $id;
  328. }
  329. }
  330. return true;
  331. }
  332. /**
  333. * Just lists all documents
  334. *
  335. * $opts['depth'] recursion level, 0 for all
  336. * $opts['hash'] do md5 sum of content?
  337. * $opts['skipacl'] list everything regardless of ACL
  338. *
  339. * @author Andreas Gohr <andi@splitbrain.org>
  340. *
  341. * @param array $data
  342. * @param string $base
  343. * @param string $file
  344. * @param string $type
  345. * @param integer $lvl
  346. * @param array $opts
  347. *
  348. * @return bool
  349. */
  350. function search_allpages(&$data, $base, $file, $type, $lvl, $opts)
  351. {
  352. if (($opts['depth'] ?? 0) > 0) {
  353. $parts = explode('/', ltrim($file, '/'));
  354. if (
  355. ($type == 'd' && count($parts) >= $opts['depth'])
  356. || ($type != 'd' && count($parts) > $opts['depth'])
  357. ) {
  358. return false; // depth reached
  359. }
  360. }
  361. //we do nothing with directories
  362. if ($type == 'd') {
  363. return true;
  364. }
  365. //only search txt files
  366. if (!str_ends_with($file, '.txt')) return true;
  367. $item = [];
  368. $item['id'] = pathID($file);
  369. if (empty($opts['skipacl']) && auth_quickaclcheck($item['id']) < AUTH_READ) {
  370. return false;
  371. }
  372. $item['rev'] = filemtime($base . '/' . $file);
  373. $item['mtime'] = $item['rev'];
  374. $item['size'] = filesize($base . '/' . $file);
  375. if (!empty($opts['hash'])) {
  376. $item['hash'] = md5(trim(rawWiki($item['id'])));
  377. }
  378. $data[] = $item;
  379. return true;
  380. }
  381. /* ------------- helper functions below -------------- */
  382. /**
  383. * fulltext sort
  384. *
  385. * Callback sort function for use with usort to sort the data
  386. * structure created by search_fulltext. Sorts descending by count
  387. *
  388. * @author Andreas Gohr <andi@splitbrain.org>
  389. *
  390. * @param array $a
  391. * @param array $b
  392. *
  393. * @return int
  394. */
  395. function sort_search_fulltext($a, $b)
  396. {
  397. if ($a['count'] > $b['count']) {
  398. return -1;
  399. } elseif ($a['count'] < $b['count']) {
  400. return 1;
  401. } else {
  402. return Sort::strcmp($a['id'], $b['id']);
  403. }
  404. }
  405. /**
  406. * translates a document path to an ID
  407. *
  408. * @author Andreas Gohr <andi@splitbrain.org>
  409. * @todo move to pageutils
  410. *
  411. * @param string $path
  412. * @param bool $keeptxt
  413. *
  414. * @return string
  415. */
  416. function pathID($path, $keeptxt = false)
  417. {
  418. $id = utf8_decodeFN($path);
  419. $id = str_replace('/', ':', $id);
  420. if (!$keeptxt) $id = preg_replace('#\.txt$#', '', $id);
  421. $id = trim($id, ':');
  422. return $id;
  423. }
  424. /**
  425. * This is a very universal callback for the search() function, replacing
  426. * many of the former individual functions at the cost of a more complex
  427. * setup.
  428. *
  429. * How the function behaves, depends on the options passed in the $opts
  430. * array, where the following settings can be used.
  431. *
  432. * depth int recursion depth. 0 for unlimited (default: 0)
  433. * keeptxt bool keep .txt extension for IDs (default: false)
  434. * listfiles bool include files in listing (default: false)
  435. * listdirs bool include namespaces in listing (default: false)
  436. * pagesonly bool restrict files to pages (default: false)
  437. * skipacl bool do not check for READ permission (default: false)
  438. * sneakyacl bool don't recurse into nonreadable dirs (default: false)
  439. * hash bool create MD5 hash for files (default: false)
  440. * meta bool return file metadata (default: false)
  441. * filematch string match files against this regexp (default: '', so accept everything)
  442. * idmatch string match full ID against this regexp (default: '', so accept everything)
  443. * dirmatch string match directory against this regexp when adding (default: '', so accept everything)
  444. * nsmatch string match namespace against this regexp when adding (default: '', so accept everything)
  445. * recmatch string match directory against this regexp when recursing (default: '', so accept everything)
  446. * showmsg bool warn about non-ID files (default: false)
  447. * showhidden bool show hidden files(e.g. by hidepages config) too (default: false)
  448. * firsthead bool return first heading for pages (default: false)
  449. *
  450. * @param array &$data - Reference to the result data structure
  451. * @param string $base - Base usually $conf['datadir']
  452. * @param string $file - current file or directory relative to $base
  453. * @param string $type - Type either 'd' for directory or 'f' for file
  454. * @param int $lvl - Current recursion depht
  455. * @param array $opts - option array as given to search()
  456. * @return bool if this directory should be traversed (true) or not (false)
  457. * return value is ignored for files
  458. *
  459. * @author Andreas Gohr <gohr@cosmocode.de>
  460. */
  461. function search_universal(&$data, $base, $file, $type, $lvl, $opts)
  462. {
  463. $item = [];
  464. $return = true;
  465. // get ID and check if it is a valid one
  466. $item['id'] = pathID($file, ($type == 'd' || !empty($opts['keeptxt'])));
  467. if ($item['id'] !== cleanID($item['id'])) {
  468. if (!empty($opts['showmsg'])) {
  469. msg(hsc($item['id']) . ' is not a valid file name for DokuWiki - skipped', -1);
  470. }
  471. return false; // skip non-valid files
  472. }
  473. $item['ns'] = getNS($item['id']);
  474. if ($type == 'd') {
  475. // decide if to recursion into this directory is wanted
  476. if (empty($opts['depth'])) {
  477. $return = true; // recurse forever
  478. } else {
  479. $depth = substr_count($file, '/');
  480. if ($depth >= $opts['depth']) {
  481. $return = false; // depth reached
  482. } else {
  483. $return = true;
  484. }
  485. }
  486. if ($return) {
  487. $match = empty($opts['recmatch']) || preg_match('/' . $opts['recmatch'] . '/', $file);
  488. if (!$match) {
  489. return false; // doesn't match
  490. }
  491. }
  492. }
  493. // check ACL
  494. if (empty($opts['skipacl'])) {
  495. if ($type == 'd') {
  496. $item['perm'] = auth_quickaclcheck($item['id'] . ':*');
  497. } else {
  498. $item['perm'] = auth_quickaclcheck($item['id']); //FIXME check namespace for media files
  499. }
  500. } else {
  501. $item['perm'] = AUTH_DELETE;
  502. }
  503. // are we done here maybe?
  504. if ($type == 'd') {
  505. if (empty($opts['listdirs'])) return $return;
  506. //neither list nor recurse forbidden items:
  507. if (empty($opts['skipacl']) && !empty($opts['sneakyacl']) && $item['perm'] < AUTH_READ) return false;
  508. if (!empty($opts['dirmatch']) && !preg_match('/' . $opts['dirmatch'] . '/', $file)) return $return;
  509. if (!empty($opts['nsmatch']) && !preg_match('/' . $opts['nsmatch'] . '/', $item['ns'])) return $return;
  510. } else {
  511. if (empty($opts['listfiles'])) return $return;
  512. if (empty($opts['skipacl']) && $item['perm'] < AUTH_READ) return $return;
  513. if (!empty($opts['pagesonly']) && !str_ends_with($file, '.txt')) return $return;
  514. if (empty($opts['showhidden']) && isHiddenPage($item['id'])) return $return;
  515. if (!empty($opts['filematch']) && !preg_match('/' . $opts['filematch'] . '/', $file)) return $return;
  516. if (!empty($opts['idmatch']) && !preg_match('/' . $opts['idmatch'] . '/', $item['id'])) return $return;
  517. }
  518. // still here? prepare the item
  519. $item['type'] = $type;
  520. $item['level'] = $lvl;
  521. $item['open'] = $return;
  522. if (!empty($opts['meta'])) {
  523. $item['file'] = PhpString::basename($file);
  524. $item['size'] = filesize($base . '/' . $file);
  525. $item['mtime'] = filemtime($base . '/' . $file);
  526. $item['rev'] = $item['mtime'];
  527. $item['writable'] = is_writable($base . '/' . $file);
  528. $item['executable'] = is_executable($base . '/' . $file);
  529. }
  530. if ($type == 'f') {
  531. if (!empty($opts['hash'])) $item['hash'] = md5(io_readFile($base . '/' . $file, false));
  532. if (!empty($opts['firsthead'])) $item['title'] = p_get_first_heading($item['id'], METADATA_DONT_RENDER);
  533. }
  534. // finally add the item
  535. $data[] = $item;
  536. return $return;
  537. }
  538. //Setup VIM: ex: et ts=4 :