You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

350 lines
11 KiB

  1. <?php
  2. /**
  3. * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
  4. * For an intro to the Lexer see:
  5. * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
  6. *
  7. * @author Marcus Baker http://www.lastcraft.com
  8. */
  9. namespace dokuwiki\Parsing\Lexer;
  10. /**
  11. * Accepts text and breaks it into tokens.
  12. *
  13. * Some optimisation to make the sure the content is only scanned by the PHP regex
  14. * parser once. Lexer modes must not start with leading underscores.
  15. */
  16. class Lexer
  17. {
  18. /** @var ParallelRegex[] */
  19. protected $regexes = [];
  20. /** @var \Doku_Handler */
  21. protected $handler;
  22. /** @var StateStack */
  23. protected $modeStack;
  24. /** @var array mode "rewrites" */
  25. protected $mode_handlers = [];
  26. /** @var bool case sensitive? */
  27. protected $case;
  28. /**
  29. * Sets up the lexer in case insensitive matching by default.
  30. *
  31. * @param \Doku_Handler $handler Handling strategy by reference.
  32. * @param string $start Starting handler.
  33. * @param boolean $case True for case sensitive.
  34. */
  35. public function __construct($handler, $start = "accept", $case = false)
  36. {
  37. $this->case = $case;
  38. $this->handler = $handler;
  39. $this->modeStack = new StateStack($start);
  40. }
  41. /**
  42. * Adds a token search pattern for a particular parsing mode.
  43. *
  44. * The pattern does not change the current mode.
  45. *
  46. * @param string $pattern Perl style regex, but ( and )
  47. * lose the usual meaning.
  48. * @param string $mode Should only apply this
  49. * pattern when dealing with
  50. * this type of input.
  51. */
  52. public function addPattern($pattern, $mode = "accept")
  53. {
  54. if (! isset($this->regexes[$mode])) {
  55. $this->regexes[$mode] = new ParallelRegex($this->case);
  56. }
  57. $this->regexes[$mode]->addPattern($pattern);
  58. }
  59. /**
  60. * Adds a pattern that will enter a new parsing mode.
  61. *
  62. * Useful for entering parenthesis, strings, tags, etc.
  63. *
  64. * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
  65. * @param string $mode Should only apply this pattern when dealing with this type of input.
  66. * @param string $new_mode Change parsing to this new nested mode.
  67. */
  68. public function addEntryPattern($pattern, $mode, $new_mode)
  69. {
  70. if (! isset($this->regexes[$mode])) {
  71. $this->regexes[$mode] = new ParallelRegex($this->case);
  72. }
  73. $this->regexes[$mode]->addPattern($pattern, $new_mode);
  74. }
  75. /**
  76. * Adds a pattern that will exit the current mode and re-enter the previous one.
  77. *
  78. * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
  79. * @param string $mode Mode to leave.
  80. */
  81. public function addExitPattern($pattern, $mode)
  82. {
  83. if (! isset($this->regexes[$mode])) {
  84. $this->regexes[$mode] = new ParallelRegex($this->case);
  85. }
  86. $this->regexes[$mode]->addPattern($pattern, "__exit");
  87. }
  88. /**
  89. * Adds a pattern that has a special mode.
  90. *
  91. * Acts as an entry and exit pattern in one go, effectively calling a special
  92. * parser handler for this token only.
  93. *
  94. * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
  95. * @param string $mode Should only apply this pattern when dealing with this type of input.
  96. * @param string $special Use this mode for this one token.
  97. */
  98. public function addSpecialPattern($pattern, $mode, $special)
  99. {
  100. if (! isset($this->regexes[$mode])) {
  101. $this->regexes[$mode] = new ParallelRegex($this->case);
  102. }
  103. $this->regexes[$mode]->addPattern($pattern, "_$special");
  104. }
  105. /**
  106. * Adds a mapping from a mode to another handler.
  107. *
  108. * @param string $mode Mode to be remapped.
  109. * @param string $handler New target handler.
  110. */
  111. public function mapHandler($mode, $handler)
  112. {
  113. $this->mode_handlers[$mode] = $handler;
  114. }
  115. /**
  116. * Splits the page text into tokens.
  117. *
  118. * Will fail if the handlers report an error or if no content is consumed. If successful then each
  119. * unparsed and parsed token invokes a call to the held listener.
  120. *
  121. * @param string $raw Raw HTML text.
  122. * @return boolean True on success, else false.
  123. */
  124. public function parse($raw)
  125. {
  126. if (! isset($this->handler)) {
  127. return false;
  128. }
  129. $initialLength = strlen($raw);
  130. $length = $initialLength;
  131. $pos = 0;
  132. while (is_array($parsed = $this->reduce($raw))) {
  133. [$unmatched, $matched, $mode] = $parsed;
  134. $currentLength = strlen($raw);
  135. $matchPos = $initialLength - $currentLength - strlen($matched);
  136. if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
  137. return false;
  138. }
  139. if ($currentLength === $length) {
  140. return false;
  141. }
  142. $length = $currentLength;
  143. $pos = $initialLength - $currentLength;
  144. }
  145. if (!$parsed) {
  146. return false;
  147. }
  148. return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
  149. }
  150. /**
  151. * Gives plugins access to the mode stack
  152. *
  153. * @return StateStack
  154. */
  155. public function getModeStack()
  156. {
  157. return $this->modeStack;
  158. }
  159. /**
  160. * Sends the matched token and any leading unmatched
  161. * text to the parser changing the lexer to a new
  162. * mode if one is listed.
  163. *
  164. * @param string $unmatched Unmatched leading portion.
  165. * @param string $matched Actual token match.
  166. * @param bool|string $mode Mode after match. A boolean false mode causes no change.
  167. * @param int $initialPos
  168. * @param int $matchPos Current byte index location in raw doc thats being parsed
  169. * @return boolean False if there was any error from the parser.
  170. */
  171. protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
  172. {
  173. if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
  174. return false;
  175. }
  176. if ($this->isModeEnd($mode)) {
  177. if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
  178. return false;
  179. }
  180. return $this->modeStack->leave();
  181. }
  182. if ($this->isSpecialMode($mode)) {
  183. $this->modeStack->enter($this->decodeSpecial($mode));
  184. if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
  185. return false;
  186. }
  187. return $this->modeStack->leave();
  188. }
  189. if (is_string($mode)) {
  190. $this->modeStack->enter($mode);
  191. return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
  192. }
  193. return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
  194. }
  195. /**
  196. * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
  197. * mode stack.
  198. *
  199. * @param string $mode Mode to test.
  200. * @return boolean True if this is the exit mode.
  201. */
  202. protected function isModeEnd($mode)
  203. {
  204. return ($mode === "__exit");
  205. }
  206. /**
  207. * Test to see if the mode is one where this mode is entered for this token only and automatically
  208. * leaves immediately afterwoods.
  209. *
  210. * @param string $mode Mode to test.
  211. * @return boolean True if this is the exit mode.
  212. */
  213. protected function isSpecialMode($mode)
  214. {
  215. return str_starts_with($mode, '_');
  216. }
  217. /**
  218. * Strips the magic underscore marking single token modes.
  219. *
  220. * @param string $mode Mode to decode.
  221. * @return string Underlying mode name.
  222. */
  223. protected function decodeSpecial($mode)
  224. {
  225. return substr($mode, 1);
  226. }
  227. /**
  228. * Calls the parser method named after the current mode.
  229. *
  230. * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
  231. *
  232. * @param string $content Text parsed.
  233. * @param boolean $is_match Token is recognised rather
  234. * than unparsed data.
  235. * @param int $pos Current byte index location in raw doc
  236. * thats being parsed
  237. * @return bool
  238. */
  239. protected function invokeHandler($content, $is_match, $pos)
  240. {
  241. if (($content === "") || ($content === false)) {
  242. return true;
  243. }
  244. $handler = $this->modeStack->getCurrent();
  245. if (isset($this->mode_handlers[$handler])) {
  246. $handler = $this->mode_handlers[$handler];
  247. }
  248. // modes starting with plugin_ are all handled by the same
  249. // handler but with an additional parameter
  250. if (str_starts_with($handler, 'plugin_')) {
  251. [$handler, $plugin] = sexplode('_', $handler, 2, '');
  252. return $this->handler->$handler($content, $is_match, $pos, $plugin);
  253. }
  254. return $this->handler->$handler($content, $is_match, $pos);
  255. }
  256. /**
  257. * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
  258. * unparsed data. Empty strings will not be matched.
  259. *
  260. * @param string $raw The subject to parse. This is the content that will be eaten.
  261. * @return array|bool Three item list of unparsed content followed by the
  262. * recognised token and finally the action the parser is to take.
  263. * True if no match, false if there is a parsing error.
  264. */
  265. protected function reduce(&$raw)
  266. {
  267. if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
  268. return false;
  269. }
  270. if ($raw === "") {
  271. return true;
  272. }
  273. if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
  274. [$unparsed, $match, $raw] = $split;
  275. return [$unparsed, $match, $action];
  276. }
  277. return true;
  278. }
  279. /**
  280. * Escapes regex characters other than (, ) and /
  281. *
  282. * @param string $str
  283. * @return string
  284. */
  285. public static function escape($str)
  286. {
  287. $chars = [
  288. '/\\\\/',
  289. '/\./',
  290. '/\+/',
  291. '/\*/',
  292. '/\?/',
  293. '/\[/',
  294. '/\^/',
  295. '/\]/',
  296. '/\$/',
  297. '/\{/',
  298. '/\}/',
  299. '/\=/',
  300. '/\!/',
  301. '/\</',
  302. '/\>/',
  303. '/\|/',
  304. '/\:/'
  305. ];
  306. $escaped = [
  307. '\\\\\\\\',
  308. '\.',
  309. '\+',
  310. '\*',
  311. '\?',
  312. '\[',
  313. '\^',
  314. '\]',
  315. '\$',
  316. '\{',
  317. '\}',
  318. '\=',
  319. '\!',
  320. '\<',
  321. '\>',
  322. '\|',
  323. '\:'
  324. ];
  325. return preg_replace($chars, $escaped, $str);
  326. }
  327. }