You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

247 lines
9.7 KiB

  1. <?php
  2. namespace dokuwiki\Utf8;
  /**
   * Convert between UTF-8 and a list of Unicode Code Points
   */
  class Unicode
  {
      /**
       * Takes a UTF-8 string and returns an array of ints representing the
       * Unicode characters. Astral planes are supported, i.e. the ints in the
       * output can be > 0xFFFF. Occurrences of the BOM (U+FEFF) are dropped from
       * the output.
       *
       * If $strict is true, the function raises a PHP error at level
       * E_USER_WARNING and returns false as soon as the input turns out not to
       * be a valid UTF-8 octet sequence: an illegal lead byte, a missing
       * continuation byte, a non-shortest form, a sequence longer than 4 octets,
       * a surrogate, a code point above 0x10FFFF, or a multi-octet sequence cut
       * off by the end of the string.
       *
       * If $strict is false, nothing is reported: bytes that cannot be used at
       * their position are skipped, a sequence that is still incomplete at the
       * end of the string is dropped, and code points decoded from illegal
       * sequences (non-shortest form, surrogates, out of range) are still
       * appended to the result.
       *
       * Note: this function has been modified slightly in this library to
       * trigger errors on encountering bad bytes
       *
       * @author <hsivonen@iki.fi>
       * @author Harry Fuecks <hfuecks@gmail.com>
       * @see toUtf8
       * @link http://hsivonen.iki.fi/php-utf8/
       * @link http://sourceforge.net/projects/phputf8/
       * @todo break into less complex chunks
       * @todo use exceptions instead of user errors
       *
       * @param string $str UTF-8 encoded string
       * @param boolean $strict Check for invalid sequences?
       * @return array|false array of unicode code points or false if UTF-8 invalid
       */
      public static function fromUtf8($str, $strict = false)
      {
          $mState = 0; // number of continuation octets still expected for the current character
          $mUcs4 = 0;  // code point accumulated so far
          $mBytes = 1; // total number of octets in the current sequence
          $out = [];

          $len = strlen($str);
          for ($i = 0; $i < $len; $i++) {
              $in = ord($str[$i]);

              if ($mState === 0) {
                  // Not inside a sequence: expect US-ASCII or the lead octet of
                  // a multi-octet sequence.
                  if (0 === (0x80 & $in)) {
                      // US-ASCII, pass straight through.
                      $out[] = $in;
                      $mBytes = 1;
                  } elseif (0xC0 === (0xE0 & $in)) {
                      // Lead octet of a 2 octet sequence (110xxxxx)
                      $mUcs4 = ($in & 0x1F) << 6;
                      $mState = 1;
                      $mBytes = 2;
                  } elseif (0xE0 === (0xF0 & $in)) {
                      // Lead octet of a 3 octet sequence (1110xxxx)
                      $mUcs4 = ($in & 0x0F) << 12;
                      $mState = 2;
                      $mBytes = 3;
                  } elseif (0xF0 === (0xF8 & $in)) {
                      // Lead octet of a 4 octet sequence (11110xxx)
                      $mUcs4 = ($in & 0x07) << 18;
                      $mState = 3;
                      $mBytes = 4;
                  } elseif (0xF8 === (0xFC & $in)) {
                      /* Lead octet of a 5 octet sequence.
                       *
                       * This is illegal because the encoded codepoint must be either
                       * (a) not the shortest form or
                       * (b) outside the Unicode range of 0-0x10FFFF.
                       * Rather than trying to resynchronize, we carry on until the end
                       * of the sequence and let the later error handling code catch it.
                       */
                      $mUcs4 = ($in & 0x03) << 24;
                      $mState = 4;
                      $mBytes = 5;
                  } elseif (0xFC === (0xFE & $in)) {
                      // Lead octet of a 6 octet sequence, see comments for 5 octet sequence.
                      $mUcs4 = ($in & 1) << 30;
                      $mState = 5;
                      $mBytes = 6;
                  } elseif ($strict) {
                      /* Current octet is neither in the US-ASCII range nor a legal first
                       * octet of a multi-octet sequence.
                       */
                      trigger_error(
                          'utf8_to_unicode: Illegal sequence identifier ' .
                          'in UTF-8 at byte ' . $i,
                          E_USER_WARNING
                      );
                      return false;
                  }
                  // non-strict: the unusable octet is silently skipped
              } elseif (0x80 === (0xC0 & $in)) {
                  // Inside a sequence and this is a legal continuation octet
                  // (10xxxxxx): merge its 6 payload bits at the right position.
                  $mUcs4 |= ($in & 0x0000003F) << (($mState - 1) * 6);
                  $mState--;

                  if ($mState === 0) {
                      // End of the multi-octet sequence; $mUcs4 now holds the
                      // complete code point. Check for illegal sequences and
                      // codepoints.
                      $illegal =
                          // From Unicode 3.1, non-shortest form is illegal
                          ((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
                          ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
                          ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
                          (4 < $mBytes) ||
                          // From Unicode 3.2, surrogate characters are illegal
                          (($mUcs4 & 0xFFFFF800) === 0xD800) ||
                          // Codepoints outside the Unicode range are illegal
                          ($mUcs4 > 0x10FFFF);

                      if ($illegal && $strict) {
                          trigger_error(
                              'utf8_to_unicode: Illegal sequence or codepoint ' .
                              'in UTF-8 at byte ' . $i,
                              E_USER_WARNING
                          );
                          return false;
                      }

                      if (0xFEFF !== $mUcs4) {
                          // BOM is legal but we don't want to output it
                          $out[] = $mUcs4;
                      }

                      // reset the decoder for the next character
                      $mState = 0;
                      $mUcs4 = 0;
                      $mBytes = 1;
                  }
              } elseif ($strict) {
                  /*
                   * A continuation octet was expected but something else was
                   * found: incomplete multi-octet sequence.
                   */
                  trigger_error(
                      'utf8_to_unicode: Incomplete multi-octet ' .
                      ' sequence in UTF-8 at byte ' . $i,
                      E_USER_WARNING
                  );
                  return false;
              }
              // non-strict: the octet is skipped and the decoder keeps waiting
              // for the missing continuation octets
          }

          // The input ended in the middle of a multi-octet sequence. The loop
          // above cannot notice this, so strict mode has to check it here.
          if ($strict && $mState !== 0) {
              trigger_error(
                  'utf8_to_unicode: Incomplete multi-octet ' .
                  ' sequence in UTF-8 at byte ' . $len,
                  E_USER_WARNING
              );
              return false;
          }

          return $out;
      }

      /**
       * Takes an array of ints representing the Unicode characters and returns
       * a UTF-8 string. Astral planes are supported, i.e. the ints in the
       * input can be > 0xFFFF. Occurrences of the BOM (U+FEFF) are dropped from
       * the output.
       *
       * If $strict is true, the function raises a PHP error at level
       * E_USER_WARNING and returns false when the array contains a surrogate
       * (0xD800-0xDFFF) or a value outside the Unicode range (negative or above
       * 0x10FFFF). If $strict is false such values are skipped silently.
       *
       * Anything that is not an array yields an empty string.
       *
       * Note: the result is built by plain string concatenation. No output
       * buffer is opened, so returning early in strict mode cannot leave a
       * dangling buffer behind.
       *
       * @param array $arr of unicode code points representing a string
       * @param boolean $strict Check for invalid sequences?
       * @return string|false UTF-8 string or false if array contains invalid code points
       *
       * @author <hsivonen@iki.fi>
       * @author Harry Fuecks <hfuecks@gmail.com>
       * @see fromUtf8
       * @link http://hsivonen.iki.fi/php-utf8/
       * @link http://sourceforge.net/projects/phputf8/
       * @todo use exceptions instead of user errors
       */
      public static function toUtf8($arr, $strict = false)
      {
          if (!is_array($arr)) {
              return '';
          }

          $utf8 = '';
          foreach ($arr as $k => $cp) {
              if (($cp >= 0) && ($cp <= 0x007f)) {
                  // ASCII range (including control chars)
                  $utf8 .= chr($cp);
              } elseif (($cp >= 0x0080) && ($cp <= 0x07ff)) {
                  // 2 byte sequence; the lower bound keeps negative values out
                  $utf8 .= chr(0xc0 | ($cp >> 6));
                  $utf8 .= chr(0x80 | ($cp & 0x003f));
              } elseif ($cp == 0xFEFF) {
                  // Byte order mark: skipped on purpose
                  continue;
              } elseif (($cp >= 0xD800) && ($cp <= 0xDFFF)) {
                  // Surrogates must not be encoded in UTF-8
                  if ($strict) {
                      trigger_error(
                          'unicode_to_utf8: Illegal surrogate ' .
                          'at index: ' . $k . ', value: ' . $cp,
                          E_USER_WARNING
                      );
                      return false;
                  }
              } elseif (($cp >= 0x0800) && ($cp <= 0xffff)) {
                  // 3 byte sequence
                  $utf8 .= chr(0xe0 | ($cp >> 12));
                  $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
                  $utf8 .= chr(0x80 | ($cp & 0x003f));
              } elseif (($cp >= 0x10000) && ($cp <= 0x10ffff)) {
                  // 4 byte sequence
                  $utf8 .= chr(0xf0 | ($cp >> 18));
                  $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
                  $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
                  $utf8 .= chr(0x80 | ($cp & 0x3f));
              } elseif ($strict) {
                  // negative or above 0x10FFFF: out of range
                  trigger_error(
                      'unicode_to_utf8: Codepoint out of Unicode range ' .
                      'at index: ' . $k . ', value: ' . $cp,
                      E_USER_WARNING
                  );
                  return false;
              }
          }

          return $utf8;
      }
  }