[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Utf8/ -> Unicode.php (source)

   1  <?php
   2  
   3  namespace dokuwiki\Utf8;
   4  
   5  /**
   6   * Convert between UTF-8 and a list of Unicode Code Points
   7   */
   8  class Unicode
   9  {
  10      /**
  11       * Takes an UTF-8 string and returns an array of ints representing the
  12       * Unicode characters. Astral planes are supported ie. the ints in the
  13       * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
  14       * are not allowed.
  15       *
  16       * If $strict is set to true the function returns false if the input
  17       * string isn't a valid UTF-8 octet sequence and raises a PHP error at
  18       * level E_USER_WARNING
  19       *
  20       * Note: this function has been modified slightly in this library to
  21       * trigger errors on encountering bad bytes
  22       *
  23       * @author <hsivonen@iki.fi>
  24       * @author Harry Fuecks <hfuecks@gmail.com>
  25       * @see    unicode_to_utf8
  26       * @link   http://hsivonen.iki.fi/php-utf8/
  27       * @link   http://sourceforge.net/projects/phputf8/
  28       * @todo break into less complex chunks
  29       * @todo use exceptions instead of user errors
  30       *
  31       * @param  string $str UTF-8 encoded string
  32       * @param  boolean $strict Check for invalid sequences?
  33       * @return mixed array of unicode code points or false if UTF-8 invalid
  34       */
  35      public static function fromUtf8($str, $strict = false)
  36      {
  37          $mState = 0;     // cached expected number of octets after the current octet
  38          // until the beginning of the next UTF8 character sequence
  39          $mUcs4 = 0;     // cached Unicode character
  40          $mBytes = 1;     // cached expected number of octets in the current sequence
  41  
  42          $out = [];
  43  
  44          $len = strlen($str);
  45  
  46          for ($i = 0; $i < $len; $i++) {
  47              $in = ord($str[$i]);
  48  
  49              if ($mState === 0) {
  50                  // When mState is zero we expect either a US-ASCII character or a
  51                  // multi-octet sequence.
  52                  if (0 === (0x80 & $in)) {
  53                      // US-ASCII, pass straight through.
  54                      $out[] = $in;
  55                      $mBytes = 1;
  56                  } elseif (0xC0 === (0xE0 & $in)) {
  57                      // First octet of 2 octet sequence
  58                      $mUcs4 = $in;
  59                      $mUcs4 = ($mUcs4 & 0x1F) << 6;
  60                      $mState = 1;
  61                      $mBytes = 2;
  62                  } elseif (0xE0 === (0xF0 & $in)) {
  63                      // First octet of 3 octet sequence
  64                      $mUcs4 = $in;
  65                      $mUcs4 = ($mUcs4 & 0x0F) << 12;
  66                      $mState = 2;
  67                      $mBytes = 3;
  68                  } elseif (0xF0 === (0xF8 & $in)) {
  69                      // First octet of 4 octet sequence
  70                      $mUcs4 = $in;
  71                      $mUcs4 = ($mUcs4 & 0x07) << 18;
  72                      $mState = 3;
  73                      $mBytes = 4;
  74                  } elseif (0xF8 === (0xFC & $in)) {
  75                      /* First octet of 5 octet sequence.
  76                       *
  77                       * This is illegal because the encoded codepoint must be either
  78                       * (a) not the shortest form or
  79                       * (b) outside the Unicode range of 0-0x10FFFF.
  80                       * Rather than trying to resynchronize, we will carry on until the end
  81                       * of the sequence and let the later error handling code catch it.
  82                       */
  83                      $mUcs4 = $in;
  84                      $mUcs4 = ($mUcs4 & 0x03) << 24;
  85                      $mState = 4;
  86                      $mBytes = 5;
  87                  } elseif (0xFC === (0xFE & $in)) {
  88                      // First octet of 6 octet sequence, see comments for 5 octet sequence.
  89                      $mUcs4 = $in;
  90                      $mUcs4 = ($mUcs4 & 1) << 30;
  91                      $mState = 5;
  92                      $mBytes = 6;
  93                  } elseif ($strict) {
  94                      /* Current octet is neither in the US-ASCII range nor a legal first
  95                       * octet of a multi-octet sequence.
  96                       */
  97                      trigger_error(
  98                          'utf8_to_unicode: Illegal sequence identifier ' .
  99                          'in UTF-8 at byte ' . $i,
 100                          E_USER_WARNING
 101                      );
 102                      return false;
 103                  }
 104              } elseif (0x80 === (0xC0 & $in)) {
 105                  // When mState is non-zero, we expect a continuation of the multi-octet
 106                  // sequence
 107                  // Legal continuation.
 108                  $shift = ($mState - 1) * 6;
 109                  $tmp = $in;
 110                  $tmp = ($tmp & 0x0000003F) << $shift;
 111                  $mUcs4 |= $tmp;
 112                  /**
 113                   * End of the multi-octet sequence. mUcs4 now contains the final
 114                   * Unicode codepoint to be output
 115                   */
 116                  if (0 === --$mState) {
 117                      /*
 118                       * Check for illegal sequences and codepoints.
 119                       */
 120                      // From Unicode 3.1, non-shortest form is illegal
 121                      if (
 122                          ((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
 123                          ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
 124                          ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
 125                          (4 < $mBytes) ||
 126                          // From Unicode 3.2, surrogate characters are illegal
 127                          (($mUcs4 & 0xFFFFF800) === 0xD800) ||
 128                          // Codepoints outside the Unicode range are illegal
 129                          ($mUcs4 > 0x10FFFF)
 130                      ) {
 131                          if ($strict) {
 132                              trigger_error(
 133                                  'utf8_to_unicode: Illegal sequence or codepoint ' .
 134                                  'in UTF-8 at byte ' . $i,
 135                                  E_USER_WARNING
 136                              );
 137  
 138                              return false;
 139                          }
 140                      }
 141  
 142                      if (0xFEFF !== $mUcs4) {
 143                          // BOM is legal but we don't want to output it
 144                          $out[] = $mUcs4;
 145                      }
 146  
 147                      //initialize UTF8 cache
 148                      $mState = 0;
 149                      $mUcs4 = 0;
 150                      $mBytes = 1;
 151                  }
 152              } elseif ($strict) {
 153                  /**
 154                   *((0xC0 & (*in) != 0x80) && (mState != 0))
 155                   * Incomplete multi-octet sequence.
 156                   */
 157                  trigger_error(
 158                      'utf8_to_unicode: Incomplete multi-octet ' .
 159                      '   sequence in UTF-8 at byte ' . $i,
 160                      E_USER_WARNING
 161                  );
 162  
 163                  return false;
 164              }
 165          }
 166          return $out;
 167      }
 168  
 169      /**
 170       * Takes an array of ints representing the Unicode characters and returns
 171       * a UTF-8 string. Astral planes are supported ie. the ints in the
 172       * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 173       * are not allowed.
 174       *
 175       * If $strict is set to true the function returns false if the input
 176       * array contains ints that represent surrogates or are outside the
 177       * Unicode range and raises a PHP error at level E_USER_WARNING
 178       *
 179       * Note: this function has been modified slightly in this library to use
 180       * output buffering to concatenate the UTF-8 string (faster) as well as
 181       * reference the array by it's keys
 182       *
 183       * @param  array $arr of unicode code points representing a string
 184       * @param  boolean $strict Check for invalid sequences?
 185       * @return string|false UTF-8 string or false if array contains invalid code points
 186       *
 187       * @author <hsivonen@iki.fi>
 188       * @author Harry Fuecks <hfuecks@gmail.com>
 189       * @see    utf8_to_unicode
 190       * @link   http://hsivonen.iki.fi/php-utf8/
 191       * @link   http://sourceforge.net/projects/phputf8/
 192       * @todo use exceptions instead of user errors
 193       */
 194      public static function toUtf8($arr, $strict = false)
 195      {
 196          if (!is_array($arr)) return '';
 197          ob_start();
 198  
 199          foreach (array_keys($arr) as $k) {
 200              if (($arr[$k] >= 0) && ($arr[$k] <= 0x007f)) {
 201                  # ASCII range (including control chars)
 202                  echo chr($arr[$k]);
 203              } elseif ($arr[$k] <= 0x07ff) {
 204                  # 2 byte sequence
 205                  echo chr(0xc0 | ($arr[$k] >> 6));
 206                  echo chr(0x80 | ($arr[$k] & 0x003f));
 207              } elseif ($arr[$k] == 0xFEFF) {
 208                  # Byte order mark (skip)
 209                  // nop -- zap the BOM
 210              } elseif ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
 211                  # Test for illegal surrogates
 212                  // found a surrogate
 213                  if ($strict) {
 214                      trigger_error(
 215                          'unicode_to_utf8: Illegal surrogate ' .
 216                          'at index: ' . $k . ', value: ' . $arr[$k],
 217                          E_USER_WARNING
 218                      );
 219                      return false;
 220                  }
 221              } elseif ($arr[$k] <= 0xffff) {
 222                  # 3 byte sequence
 223                  echo chr(0xe0 | ($arr[$k] >> 12));
 224                  echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
 225                  echo chr(0x80 | ($arr[$k] & 0x003f));
 226              } elseif ($arr[$k] <= 0x10ffff) {
 227                  # 4 byte sequence
 228                  echo chr(0xf0 | ($arr[$k] >> 18));
 229                  echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
 230                  echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
 231                  echo chr(0x80 | ($arr[$k] & 0x3f));
 232              } elseif ($strict) {
 233                  trigger_error(
 234                      'unicode_to_utf8: Codepoint out of Unicode range ' .
 235                      'at index: ' . $k . ', value: ' . $arr[$k],
 236                      E_USER_WARNING
 237                  );
 238  
 239                  // out of range
 240                  return false;
 241              }
 242          }
 243  
 244          return ob_get_clean();
 245      }
 246  }