[ Index ] |
PHP Cross Reference of DokuWiki |
[Summary view] [Print] [Text view]
<?php

namespace dokuwiki\Utf8;

/**
 * Convert between UTF-8 and a list of Unicode Code Points
 */
class Unicode
{

    /**
     * Takes an UTF-8 string and returns an array of ints representing the
     * Unicode characters. Astral planes are supported ie. the ints in the
     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
     * level E_USER_WARNING. This includes input that ends in the middle of
     * a multi-octet sequence.
     *
     * If $strict is false, bytes that cannot start or continue a sequence are
     * silently skipped, and code points decoded from illegal sequences
     * (overlong forms, surrogates, values above 0x10FFFF) are still emitted.
     *
     * Note: this function has been modified slightly in this library to
     * trigger errors on encountering bad bytes
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see unicode_to_utf8
     * @link http://hsivonen.iki.fi/php-utf8/
     * @link http://sourceforge.net/projects/phputf8/
     * @todo break into less complex chunks
     * @todo use exceptions instead of user errors
     *
     * @param string $str UTF-8 encoded string
     * @param boolean $strict Check for invalid sequences?
     * @return mixed array of unicode code points or false if UTF-8 invalid
     */
    public static function fromUtf8($str, $strict = false)
    {
        $mState = 0; // number of continuation octets still expected for the current character
        $mUcs4 = 0;  // code point assembled so far
        $mBytes = 1; // total number of octets in the current sequence

        $out = [];

        $len = strlen($str);

        for ($i = 0; $i < $len; $i++) {
            $in = ord($str[$i]);

            if ($mState === 0) {
                // When mState is zero we expect either a US-ASCII character or
                // the first octet of a multi-octet sequence.
                if (0 === (0x80 & $in)) {
                    // US-ASCII, pass straight through.
                    $out[] = $in;
                    $mBytes = 1;
                } elseif (0xC0 === (0xE0 & $in)) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 === (0xF0 & $in)) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 === (0xF8 & $in)) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 === (0xFC & $in)) {
                    /* First octet of 5 octet sequence.
                     *
                     * This is illegal because the encoded codepoint must be either
                     * (a) not the shortest form or
                     * (b) outside the Unicode range of 0-0x10FFFF.
                     * Rather than trying to resynchronize, we will carry on until the end
                     * of the sequence and let the later error handling code catch it.
                     */
                    $mUcs4 = ($in & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC === (0xFE & $in)) {
                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
                    $mUcs4 = ($in & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } elseif ($strict) {
                    /* Current octet is neither in the US-ASCII range nor a legal first
                     * octet of a multi-octet sequence.
                     */
                    trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier ' .
                        'in UTF-8 at byte ' . $i,
                        E_USER_WARNING
                    );
                    return false;
                }
            } elseif (0x80 === (0xC0 & $in)) {
                // Legal continuation octet: merge its 6 payload bits into place.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                if (0 === --$mState) {
                    // End of the multi-octet sequence. mUcs4 now contains the
                    // final Unicode codepoint to be output.

                    // From Unicode 3.1, non-shortest form is illegal
                    $illegal = ((2 === $mBytes) && ($mUcs4 < 0x0080))
                        || ((3 === $mBytes) && ($mUcs4 < 0x0800))
                        || ((4 === $mBytes) && ($mUcs4 < 0x10000))
                        || (4 < $mBytes)
                        // From Unicode 3.2, surrogate characters are illegal
                        || ($mUcs4 >= 0xD800 && $mUcs4 <= 0xDFFF)
                        // Codepoints outside the Unicode range are illegal
                        || ($mUcs4 > 0x10FFFF);

                    if ($illegal && $strict) {
                        trigger_error(
                            'utf8_to_unicode: Illegal sequence or codepoint ' .
                            'in UTF-8 at byte ' . $i,
                            E_USER_WARNING
                        );

                        return false;
                    }

                    if (0xFEFF !== $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    // reset the decoder state for the next character
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                }
            } elseif ($strict) {
                // A continuation octet was expected but something else arrived:
                // incomplete multi-octet sequence.
                trigger_error(
                    'utf8_to_unicode: Incomplete multi-octet ' .
                    ' sequence in UTF-8 at byte ' . $i,
                    E_USER_WARNING
                );

                return false;
            }
        }

        if ($strict && $mState !== 0) {
            // The input ended while continuation octets were still expected,
            // so the last sequence is truncated and the string is not valid UTF-8.
            trigger_error(
                'utf8_to_unicode: Incomplete multi-octet ' .
                ' sequence in UTF-8 at byte ' . $len,
                E_USER_WARNING
            );

            return false;
        }

        return $out;
    }

    /**
     * Takes an array of ints representing the Unicode characters and returns
     * a UTF-8 string. Astral planes are supported ie. the ints in the
     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
     * are not allowed.
     *
     * If $strict is set to true the function returns false if the input
     * array contains ints that represent surrogates or are outside the
     * Unicode range (negative or > 0x10FFFF) and raises a PHP error at level
     * E_USER_WARNING. If $strict is false such values are silently skipped.
     *
     * The result is built by plain string concatenation, so no output buffer
     * is opened and nothing is left dangling when the function bails out early.
     *
     * @param array $arr of unicode code points representing a string
     * @param boolean $strict Check for invalid sequences?
     * @return string|false UTF-8 string or false if array contains invalid code points
     *
     * @author <hsivonen@iki.fi>
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see utf8_to_unicode
     * @link http://hsivonen.iki.fi/php-utf8/
     * @link http://sourceforge.net/projects/phputf8/
     * @todo use exceptions instead of user errors
     */
    public static function toUtf8($arr, $strict = false)
    {
        if (!is_array($arr)) return '';

        $utf8 = '';

        foreach ($arr as $k => $cp) {
            // Every encoding branch is bounded below by 0 so that negative
            // values fall through to the out-of-range handling at the end
            // instead of being encoded as garbage bytes.
            if (($cp >= 0) && ($cp <= 0x007f)) {
                // ASCII range (including control chars)
                $utf8 .= chr($cp);
            } elseif (($cp >= 0) && ($cp <= 0x07ff)) {
                // 2 byte sequence
                $utf8 .= chr(0xc0 | ($cp >> 6))
                    . chr(0x80 | ($cp & 0x003f));
            } elseif ($cp == 0xFEFF) {
                // Byte order mark (skip)
            } elseif ($cp >= 0xD800 && $cp <= 0xDFFF) {
                // Surrogates are illegal in UTF-8
                if ($strict) {
                    trigger_error(
                        'unicode_to_utf8: Illegal surrogate ' .
                        'at index: ' . $k . ', value: ' . $cp,
                        E_USER_WARNING
                    );
                    return false;
                }
            } elseif (($cp >= 0) && ($cp <= 0xffff)) {
                // 3 byte sequence
                $utf8 .= chr(0xe0 | ($cp >> 12))
                    . chr(0x80 | (($cp >> 6) & 0x003f))
                    . chr(0x80 | ($cp & 0x003f));
            } elseif (($cp >= 0) && ($cp <= 0x10ffff)) {
                // 4 byte sequence
                $utf8 .= chr(0xf0 | ($cp >> 18))
                    . chr(0x80 | (($cp >> 12) & 0x3f))
                    . chr(0x80 | (($cp >> 6) & 0x3f))
                    . chr(0x80 | ($cp & 0x3f));
            } elseif ($strict) {
                // out of range (negative or above 0x10FFFF)
                trigger_error(
                    'unicode_to_utf8: Codepoint out of Unicode range ' .
                    'at index: ' . $k . ', value: ' . $cp,
                    E_USER_WARNING
                );

                return false;
            }
        }

        return $utf8;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body