[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Utf8/ -> Unicode.php (source)

   1  <?php
   2  
   3  namespace dokuwiki\Utf8;
   4  
   5  /**
   6   * Convert between UTF-8 and a list of Unicode Code Points
   7   */
   8  class Unicode
   9  {
  10  
  11      /**
  12       * Takes an UTF-8 string and returns an array of ints representing the
  13       * Unicode characters. Astral planes are supported ie. the ints in the
  14       * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
  15       * are not allowed.
  16       *
  17       * If $strict is set to true the function returns false if the input
  18       * string isn't a valid UTF-8 octet sequence and raises a PHP error at
  19       * level E_USER_WARNING
  20       *
  21       * Note: this function has been modified slightly in this library to
  22       * trigger errors on encountering bad bytes
  23       *
  24       * @author <hsivonen@iki.fi>
  25       * @author Harry Fuecks <hfuecks@gmail.com>
  26       * @see    unicode_to_utf8
  27       * @link   http://hsivonen.iki.fi/php-utf8/
  28       * @link   http://sourceforge.net/projects/phputf8/
  29       * @todo break into less complex chunks
  30       * @todo use exceptions instead of user errors
  31       *
  32       * @param  string $str UTF-8 encoded string
  33       * @param  boolean $strict Check for invalid sequences?
  34       * @return mixed array of unicode code points or false if UTF-8 invalid
  35       */
  36      public static function fromUtf8($str, $strict = false)
  37      {
  38          $mState = 0;     // cached expected number of octets after the current octet
  39          // until the beginning of the next UTF8 character sequence
  40          $mUcs4 = 0;     // cached Unicode character
  41          $mBytes = 1;     // cached expected number of octets in the current sequence
  42  
  43          $out = array();
  44  
  45          $len = strlen($str);
  46  
  47          for ($i = 0; $i < $len; $i++) {
  48  
  49              $in = ord($str[$i]);
  50  
  51              if ($mState === 0) {
  52  
  53                  // When mState is zero we expect either a US-ASCII character or a
  54                  // multi-octet sequence.
  55                  if (0 === (0x80 & $in)) {
  56                      // US-ASCII, pass straight through.
  57                      $out[] = $in;
  58                      $mBytes = 1;
  59  
  60                  } else if (0xC0 === (0xE0 & $in)) {
  61                      // First octet of 2 octet sequence
  62                      $mUcs4 = $in;
  63                      $mUcs4 = ($mUcs4 & 0x1F) << 6;
  64                      $mState = 1;
  65                      $mBytes = 2;
  66  
  67                  } else if (0xE0 === (0xF0 & $in)) {
  68                      // First octet of 3 octet sequence
  69                      $mUcs4 = $in;
  70                      $mUcs4 = ($mUcs4 & 0x0F) << 12;
  71                      $mState = 2;
  72                      $mBytes = 3;
  73  
  74                  } else if (0xF0 === (0xF8 & $in)) {
  75                      // First octet of 4 octet sequence
  76                      $mUcs4 = $in;
  77                      $mUcs4 = ($mUcs4 & 0x07) << 18;
  78                      $mState = 3;
  79                      $mBytes = 4;
  80  
  81                  } else if (0xF8 === (0xFC & $in)) {
  82                      /* First octet of 5 octet sequence.
  83                       *
  84                       * This is illegal because the encoded codepoint must be either
  85                       * (a) not the shortest form or
  86                       * (b) outside the Unicode range of 0-0x10FFFF.
  87                       * Rather than trying to resynchronize, we will carry on until the end
  88                       * of the sequence and let the later error handling code catch it.
  89                       */
  90                      $mUcs4 = $in;
  91                      $mUcs4 = ($mUcs4 & 0x03) << 24;
  92                      $mState = 4;
  93                      $mBytes = 5;
  94  
  95                  } else if (0xFC === (0xFE & $in)) {
  96                      // First octet of 6 octet sequence, see comments for 5 octet sequence.
  97                      $mUcs4 = $in;
  98                      $mUcs4 = ($mUcs4 & 1) << 30;
  99                      $mState = 5;
 100                      $mBytes = 6;
 101  
 102                  } elseif ($strict) {
 103                      /* Current octet is neither in the US-ASCII range nor a legal first
 104                       * octet of a multi-octet sequence.
 105                       */
 106                      trigger_error(
 107                          'utf8_to_unicode: Illegal sequence identifier ' .
 108                          'in UTF-8 at byte ' . $i,
 109                          E_USER_WARNING
 110                      );
 111                      return false;
 112  
 113                  }
 114  
 115              } else {
 116  
 117                  // When mState is non-zero, we expect a continuation of the multi-octet
 118                  // sequence
 119                  if (0x80 === (0xC0 & $in)) {
 120  
 121                      // Legal continuation.
 122                      $shift = ($mState - 1) * 6;
 123                      $tmp = $in;
 124                      $tmp = ($tmp & 0x0000003F) << $shift;
 125                      $mUcs4 |= $tmp;
 126  
 127                      /**
 128                       * End of the multi-octet sequence. mUcs4 now contains the final
 129                       * Unicode codepoint to be output
 130                       */
 131                      if (0 === --$mState) {
 132  
 133                          /*
 134                           * Check for illegal sequences and codepoints.
 135                           */
 136                          // From Unicode 3.1, non-shortest form is illegal
 137                          if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
 138                              ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
 139                              ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
 140                              (4 < $mBytes) ||
 141                              // From Unicode 3.2, surrogate characters are illegal
 142                              (($mUcs4 & 0xFFFFF800) === 0xD800) ||
 143                              // Codepoints outside the Unicode range are illegal
 144                              ($mUcs4 > 0x10FFFF)) {
 145  
 146                              if ($strict) {
 147                                  trigger_error(
 148                                      'utf8_to_unicode: Illegal sequence or codepoint ' .
 149                                      'in UTF-8 at byte ' . $i,
 150                                      E_USER_WARNING
 151                                  );
 152  
 153                                  return false;
 154                              }
 155  
 156                          }
 157  
 158                          if (0xFEFF !== $mUcs4) {
 159                              // BOM is legal but we don't want to output it
 160                              $out[] = $mUcs4;
 161                          }
 162  
 163                          //initialize UTF8 cache
 164                          $mState = 0;
 165                          $mUcs4 = 0;
 166                          $mBytes = 1;
 167                      }
 168  
 169                  } elseif ($strict) {
 170                      /**
 171                       *((0xC0 & (*in) != 0x80) && (mState != 0))
 172                       * Incomplete multi-octet sequence.
 173                       */
 174                      trigger_error(
 175                          'utf8_to_unicode: Incomplete multi-octet ' .
 176                          '   sequence in UTF-8 at byte ' . $i,
 177                          E_USER_WARNING
 178                      );
 179  
 180                      return false;
 181                  }
 182              }
 183          }
 184          return $out;
 185      }
 186  
 187      /**
 188       * Takes an array of ints representing the Unicode characters and returns
 189       * a UTF-8 string. Astral planes are supported ie. the ints in the
 190       * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 191       * are not allowed.
 192       *
 193       * If $strict is set to true the function returns false if the input
 194       * array contains ints that represent surrogates or are outside the
 195       * Unicode range and raises a PHP error at level E_USER_WARNING
 196       *
 197       * Note: this function has been modified slightly in this library to use
 198       * output buffering to concatenate the UTF-8 string (faster) as well as
 199       * reference the array by it's keys
 200       *
 201       * @param  array $arr of unicode code points representing a string
 202       * @param  boolean $strict Check for invalid sequences?
 203       * @return string|false UTF-8 string or false if array contains invalid code points
 204       *
 205       * @author <hsivonen@iki.fi>
 206       * @author Harry Fuecks <hfuecks@gmail.com>
 207       * @see    utf8_to_unicode
 208       * @link   http://hsivonen.iki.fi/php-utf8/
 209       * @link   http://sourceforge.net/projects/phputf8/
 210       * @todo use exceptions instead of user errors
 211       */
 212      public static function toUtf8($arr, $strict = false)
 213      {
 214          if (!is_array($arr)) return '';
 215          ob_start();
 216  
 217          foreach (array_keys($arr) as $k) {
 218  
 219              if (($arr[$k] >= 0) && ($arr[$k] <= 0x007f)) {
 220                  # ASCII range (including control chars)
 221  
 222                  echo chr($arr[$k]);
 223  
 224              } else if ($arr[$k] <= 0x07ff) {
 225                  # 2 byte sequence
 226  
 227                  echo chr(0xc0 | ($arr[$k] >> 6));
 228                  echo chr(0x80 | ($arr[$k] & 0x003f));
 229  
 230              } else if ($arr[$k] == 0xFEFF) {
 231                  # Byte order mark (skip)
 232                  // nop -- zap the BOM
 233  
 234              } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
 235                  # Test for illegal surrogates
 236  
 237                  // found a surrogate
 238                  if ($strict) {
 239                      trigger_error(
 240                          'unicode_to_utf8: Illegal surrogate ' .
 241                          'at index: ' . $k . ', value: ' . $arr[$k],
 242                          E_USER_WARNING
 243                      );
 244                      return false;
 245                  }
 246  
 247              } else if ($arr[$k] <= 0xffff) {
 248                  # 3 byte sequence
 249  
 250                  echo chr(0xe0 | ($arr[$k] >> 12));
 251                  echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
 252                  echo chr(0x80 | ($arr[$k] & 0x003f));
 253  
 254              } else if ($arr[$k] <= 0x10ffff) {
 255                  # 4 byte sequence
 256  
 257                  echo chr(0xf0 | ($arr[$k] >> 18));
 258                  echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
 259                  echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
 260                  echo chr(0x80 | ($arr[$k] & 0x3f));
 261  
 262              } elseif ($strict) {
 263  
 264                  trigger_error(
 265                      'unicode_to_utf8: Codepoint out of Unicode range ' .
 266                      'at index: ' . $k . ', value: ' . $arr[$k],
 267                      E_USER_WARNING
 268                  );
 269  
 270                  // out of range
 271                  return false;
 272              }
 273          }
 274  
 275          return ob_get_clean();
 276      }
 277  }