PHPXRef 0.7.1 : DokuWiki : /vendor/simplepie/simplepie/idn/idna

[Summary view] [Print] [Text view]
   1  <?php
   2  // {{{ license
   3  
   4  /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
   5  //
   6  // +----------------------------------------------------------------------+
   7  // | This library is free software; you can redistribute it and/or modify |
   8  // | it under the terms of the GNU Lesser General Public License as       |
   9  // | published by the Free Software Foundation; either version 2.1 of the |
  10  // | License, or (at your option) any later version.                      |
  11  // |                                                                      |
  12  // | This library is distributed in the hope that it will be useful, but  |
  13  // | WITHOUT ANY WARRANTY; without even the implied warranty of           |
  14  // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    |
  15  // | Lesser General Public License for more details.                      |
  16  // |                                                                      |
  17  // | You should have received a copy of the GNU Lesser General Public     |
  18  // | License along with this library; if not, write to the Free Software  |
  19  // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 |
  20  // | USA.                                                                 |
  21  // +----------------------------------------------------------------------+
  22  //
  23  
  24  // }}}
  25  
  26  /**
  27   * Encode/decode Internationalized Domain Names.
  28   *
  29   * The class allows to convert internationalized domain names
  30   * (see RFC 3490 for details) as they can be used with various registries worldwide
  31   * to be translated between their original (localized) form and their encoded form
  32   * as it will be used in the DNS (Domain Name System).
  33   *
  34   * The class provides two public methods, encode() and decode(), which do exactly
  35   * what you would expect them to do. You are allowed to use complete domain names,
  36   * simple strings and complete email addresses as well. That means, that you might
  37   * use any of the following notations:
  38   *
  39   * - www.nörgler.com
  40   * - xn--nrgler-wxa
  41   * - xn--brse-5qa.xn--knrz-1ra.info
  42   *
  43   * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
  44   * array. Unicode output is available in the same formats.
  45   * You can select your preferred format via {@link set_parameter()}.
  46   *
  47   * ACE input and output is always expected to be ASCII.
  48   *
  49   * @author  Matthias Sommerfeld <mso@phlylabs.de>
  50   * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
  51   * @version 0.5.1
  52   *
  53   */
  54  class idna_convert
  55  {
  56      /**
  57       * Holds all relevant mapping tables, loaded from a separate file on construct
  58       * See RFC3454 for details
  59       *
  60       * @var array
  61       * @access private
  62       */
  63      var $NP = array();
  64  
  65      // Internal settings, do not mess with them
  66      var $_punycode_prefix = 'xn--';
  67      var $_invalid_ucs =     0x80000000;
  68      var $_max_ucs =         0x10FFFF;
  69      var $_base =            36;
  70      var $_tmin =            1;
  71      var $_tmax =            26;
  72      var $_skew =            38;
  73      var $_damp =            700;
  74      var $_initial_bias =    72;
  75      var $_initial_n =       0x80;
  76      var $_sbase =           0xAC00;
  77      var $_lbase =           0x1100;
  78      var $_vbase =           0x1161;
  79      var $_tbase =           0x11A7;
  80      var $_lcount =          19;
  81      var $_vcount =          21;
  82      var $_tcount =          28;
  83      var $_ncount =          588;   // _vcount * _tcount
  84      var $_scount =          11172; // _lcount * _tcount * _vcount
  85      var $_error =           false;
  86  
  87      // See {@link set_paramter()} for details of how to change the following
  88      // settings from within your script / application
  89      var $_api_encoding   =  'utf8'; // Default input charset is UTF-8
  90      var $_allow_overlong =  false;  // Overlong UTF-8 encodings are forbidden
  91      var $_strict_mode    =  false;  // Behave strict or not
  92  
  93      // The constructor
  94      function __construct($options = false)
  95      {
  96          $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;
  97          if (function_exists('file_get_contents')) {
  98              $this->NP = unserialize(file_get_contents(dirname(__FILE__).'/npdata.ser'));
  99          } else {
 100              $this->NP = unserialize(join('', file(dirname(__FILE__).'/npdata.ser')));
 101          }
 102          // If parameters are given, pass these to the respective method
 103          if (is_array($options)) {
 104              return $this->set_parameter($options);
 105          }
 106          return true;
 107      }
 108  
 109      /**
 110       * Sets a new option value. Available options and values:
 111       * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
 112       *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
 113       * [overlong - Unicode does not allow unnecessarily long encodings of chars,
 114       *             to allow this, set this parameter to true, else to false;
 115       *             default is false.]
 116       * [strict - true: strict mode, good for registration purposes - Causes errors
 117       *           on failures; false: loose mode, ideal for "wildlife" applications
 118       *           by silently ignoring errors and returning the original input instead
 119       *
 120       * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
 121       * @param    string    Value to use (if parameter 1 is a string)
 122       * @return   boolean   true on success, false otherwise
 123       * @access   public
 124       */
 125      function set_parameter($option, $value = false)
 126      {
 127          if (!is_array($option)) {
 128              $option = array($option => $value);
 129          }
 130          foreach ($option as $k => $v) {
 131              switch ($k) {
 132              case 'encoding':
 133                  switch ($v) {
 134                  case 'utf8':
 135                  case 'ucs4_string':
 136                  case 'ucs4_array':
 137                      $this->_api_encoding = $v;
 138                      break;
 139                  default:
 140                      $this->_error('Set Parameter: Unknown parameter '.$v.' for option '.$k);
 141                      return false;
 142                  }
 143                  break;
 144              case 'overlong':
 145                  $this->_allow_overlong = ($v) ? true : false;
 146                  break;
 147              case 'strict':
 148                  $this->_strict_mode = ($v) ? true : false;
 149                  break;
 150              default:
 151                  $this->_error('Set Parameter: Unknown option '.$k);
 152                  return false;
 153              }
 154          }
 155          return true;
 156      }
 157  
 158      /**
 159       * Decode a given ACE domain name
 160       * @param    string   Domain name (ACE string)
 161       * [@param    string   Desired output encoding, see {@link set_parameter}]
 162       * @return   string   Decoded Domain name (UTF-8 or UCS-4)
 163       * @access   public
 164       */
 165      function decode($input, $one_time_encoding = false)
 166      {
 167          // Optionally set
 168          if ($one_time_encoding) {
 169              switch ($one_time_encoding) {
 170              case 'utf8':
 171              case 'ucs4_string':
 172              case 'ucs4_array':
 173                  break;
 174              default:
 175                  $this->_error('Unknown encoding '.$one_time_encoding);
 176                  return false;
 177              }
 178          }
 179          // Make sure to drop any newline characters around
 180          $input = trim($input);
 181  
 182          // Negotiate input and try to determine, whether it is a plain string,
 183          // an email address or something like a complete URL
 184          if (strpos($input, '@')) { // Maybe it is an email address
 185              // No no in strict mode
 186              if ($this->_strict_mode) {
 187                  $this->_error('Only simple domain name parts can be handled in strict mode');
 188                  return false;
 189              }
 190              list ($email_pref, $input) = explode('@', $input, 2);
 191              $arr = explode('.', $input);
 192              foreach ($arr as $k => $v) {
 193                  if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
 194                      $conv = $this->_decode($v);
 195                      if ($conv) $arr[$k] = $conv;
 196                  }
 197              }
 198              $input = join('.', $arr);
 199              $arr = explode('.', $email_pref);
 200              foreach ($arr as $k => $v) {
 201                  if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
 202                      $conv = $this->_decode($v);
 203                      if ($conv) $arr[$k] = $conv;
 204                  }
 205              }
 206              $email_pref = join('.', $arr);
 207              $return = $email_pref . '@' . $input;
 208          } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
 209              // No no in strict mode
 210              if ($this->_strict_mode) {
 211                  $this->_error('Only simple domain name parts can be handled in strict mode');
 212                  return false;
 213              }
 214              $parsed = parse_url($input);
 215              if (isset($parsed['host'])) {
 216                  $arr = explode('.', $parsed['host']);
 217                  foreach ($arr as $k => $v) {
 218                      $conv = $this->_decode($v);
 219                      if ($conv) $arr[$k] = $conv;
 220                  }
 221                  $parsed['host'] = join('.', $arr);
 222                  $return =
 223                          (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
 224                          .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
 225                          .$parsed['host']
 226                          .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
 227                          .(empty($parsed['path']) ? '' : $parsed['path'])
 228                          .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
 229                          .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
 230              } else { // parse_url seems to have failed, try without it
 231                  $arr = explode('.', $input);
 232                  foreach ($arr as $k => $v) {
 233                      $conv = $this->_decode($v);
 234                      $arr[$k] = ($conv) ? $conv : $v;
 235                  }
 236                  $return = join('.', $arr);
 237              }
 238          } else { // Otherwise we consider it being a pure domain name string
 239              $return = $this->_decode($input);
 240              if (!$return) $return = $input;
 241          }
 242          // The output is UTF-8 by default, other output formats need conversion here
 243          // If one time encoding is given, use this, else the objects property
 244          switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
 245          case 'utf8':
 246              return $return;
 247              break;
 248          case 'ucs4_string':
 249             return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
 250             break;
 251          case 'ucs4_array':
 252              return $this->_utf8_to_ucs4($return);
 253              break;
 254          default:
 255              $this->_error('Unsupported output format');
 256              return false;
 257          }
 258      }
 259  
 260      /**
 261       * Encode a given UTF-8 domain name
 262       * @param    string   Domain name (UTF-8 or UCS-4)
 263       * [@param    string   Desired input encoding, see {@link set_parameter}]
 264       * @return   string   Encoded Domain name (ACE string)
 265       * @access   public
 266       */
 267      function encode($decoded, $one_time_encoding = false)
 268      {
 269          // Forcing conversion of input to UCS4 array
 270          // If one time encoding is given, use this, else the objects property
 271          switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) {
 272          case 'utf8':
 273              $decoded = $this->_utf8_to_ucs4($decoded);
 274              break;
 275          case 'ucs4_string':
 276             $decoded = $this->_ucs4_string_to_ucs4($decoded);
 277          case 'ucs4_array':
 278             break;
 279          default:
 280              $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
 281              return false;
 282          }
 283  
 284          // No input, no output, what else did you expect?
 285          if (empty($decoded)) return '';
 286  
 287          // Anchors for iteration
 288          $last_begin = 0;
 289          // Output string
 290          $output = '';
 291          foreach ($decoded as $k => $v) {
 292              // Make sure to use just the plain dot
 293              switch($v) {
 294              case 0x3002:
 295              case 0xFF0E:
 296              case 0xFF61:
 297                  $decoded[$k] = 0x2E;
 298                  // Right, no break here, the above are converted to dots anyway
 299              // Stumbling across an anchoring character
 300              case 0x2E:
 301              case 0x2F:
 302              case 0x3A:
 303              case 0x3F:
 304              case 0x40:
 305                  // Neither email addresses nor URLs allowed in strict mode
 306                  if ($this->_strict_mode) {
 307                     $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
 308                     return false;
 309                  }
 310  
 311                  // Skip first char
 312                  if ($k) {
 313                      $encoded = '';
 314                      $encoded = $this->_encode(array_slice($decoded, $last_begin, (($k)-$last_begin)));
 315                      if ($encoded) {
 316                          $output .= $encoded;
 317                      } else {
 318                          $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($k)-$last_begin)));
 319                      }
 320                      $output .= chr($decoded[$k]);
 321                  }
 322                  $last_begin = $k + 1;
 323              }
 324          }
 325          // Catch the rest of the string
 326          if ($last_begin) {
 327              $inp_len = sizeof($decoded);
 328              $encoded = '';
 329              $encoded = $this->_encode(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
 330              if ($encoded) {
 331                  $output .= $encoded;
 332              } else {
 333                  $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
 334              }
 335              return $output;
 336          }
 337  
 338          if ($output = $this->_encode($decoded)) {
 339              return $output;
 340          }
 341  
 342          return $this->_ucs4_to_utf8($decoded);
 343      }
 344  
 345      /**
 346       * Use this method to get the last error that occurred
 347       * @param    void
 348       * @return   string   The last error that occurred
 349       * @access   public
 350       */
 351      function get_last_error()
 352      {
 353          return $this->_error;
 354      }
 355  
 356      /**
 357       * The actual decoding algorithm
 358       * @access   private
 359       */
 360      function _decode($encoded)
 361      {
 362          // We do need to find the Punycode prefix
 363          if (!preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $encoded)) {
 364              $this->_error('This is not a punycode string');
 365              return false;
 366          }
 367          $encode_test = preg_replace('!^'.preg_quote($this->_punycode_prefix, '!').'!', '', $encoded);
 368          // If nothing left after removing the prefix, it is hopeless
 369          if (!$encode_test) {
 370              $this->_error('The given encoded string was empty');
 371              return false;
 372          }
 373          // Find last occurence of the delimiter
 374          $delim_pos = strrpos($encoded, '-');
 375          if ($delim_pos > strlen($this->_punycode_prefix)) {
 376              for ($k = strlen($this->_punycode_prefix); $k < $delim_pos; ++$k) {
 377                  $decoded[] = ord($encoded[$k]);
 378              }
 379          } else {
 380              $decoded = array();
 381          }
 382          $deco_len = count($decoded);
 383          $enco_len = strlen($encoded);
 384  
 385          // Wandering through the strings; init
 386          $is_first = true;
 387          $bias     = $this->_initial_bias;
 388          $idx      = 0;
 389          $char     = $this->_initial_n;
 390  
 391          for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
 392              for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
 393                  $digit = $this->_decode_digit($encoded[$enco_idx++]);
 394                  $idx += $digit * $w;
 395                  $t = ($k <= $bias) ? $this->_tmin :
 396                          (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
 397                  if ($digit < $t) break;
 398                  $w = (int) ($w * ($this->_base - $t));
 399              }
 400              $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
 401              $is_first = false;
 402              $char += (int) ($idx / ($deco_len + 1));
 403              $idx %= ($deco_len + 1);
 404              if ($deco_len > 0) {
 405                  // Make room for the decoded char
 406                  for ($i = $deco_len; $i > $idx; $i--) {
 407                      $decoded[$i] = $decoded[($i - 1)];
 408                  }
 409              }
 410              $decoded[$idx++] = $char;
 411          }
 412          return $this->_ucs4_to_utf8($decoded);
 413      }
 414  
 415      /**
 416       * The actual encoding algorithm
 417       * @access   private
 418       */
 419      function _encode($decoded)
 420      {
 421          // We cannot encode a domain name containing the Punycode prefix
 422          $extract = strlen($this->_punycode_prefix);
 423          $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
 424          $check_deco = array_slice($decoded, 0, $extract);
 425  
 426          if ($check_pref == $check_deco) {
 427              $this->_error('This is already a punycode string');
 428              return false;
 429          }
 430          // We will not try to encode strings consisting of basic code points only
 431          $encodable = false;
 432          foreach ($decoded as $k => $v) {
 433              if ($v > 0x7a) {
 434                  $encodable = true;
 435                  break;
 436              }
 437          }
 438          if (!$encodable) {
 439              $this->_error('The given string does not contain encodable chars');
 440              return false;
 441          }
 442  
 443          // Do NAMEPREP
 444          $decoded = $this->_nameprep($decoded);
 445          if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed
 446  
 447          $deco_len  = count($decoded);
 448          if (!$deco_len) return false; // Empty array
 449  
 450          $codecount = 0; // How many chars have been consumed
 451  
 452          $encoded = '';
 453          // Copy all basic code points to output
 454          for ($i = 0; $i < $deco_len; ++$i) {
 455              $test = $decoded[$i];
 456              // Will match [-0-9a-zA-Z]
 457              if ((0x2F < $test && $test < 0x40) || (0x40 < $test && $test < 0x5B)
 458                      || (0x60 < $test && $test <= 0x7B) || (0x2D == $test)) {
 459                  $encoded .= chr($decoded[$i]);
 460                  $codecount++;
 461              }
 462          }
 463          if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones
 464  
 465          // Start with the prefix; copy it to output
 466          $encoded = $this->_punycode_prefix.$encoded;
 467  
 468          // If we have basic code points in output, add an hyphen to the end
 469          if ($codecount) $encoded .= '-';
 470  
 471          // Now find and encode all non-basic code points
 472          $is_first  = true;
 473          $cur_code  = $this->_initial_n;
 474          $bias      = $this->_initial_bias;
 475          $delta     = 0;
 476          while ($codecount < $deco_len) {
 477              // Find the smallest code point >= the current code point and
 478              // remember the last ouccrence of it in the input
 479              for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
 480                  if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
 481                      $next_code = $decoded[$i];
 482                  }
 483              }
 484  
 485              $delta += ($next_code - $cur_code) * ($codecount + 1);
 486              $cur_code = $next_code;
 487  
 488              // Scan input again and encode all characters whose code point is $cur_code
 489              for ($i = 0; $i < $deco_len; $i++) {
 490                  if ($decoded[$i] < $cur_code) {
 491                      $delta++;
 492                  } elseif ($decoded[$i] == $cur_code) {
 493                      for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
 494                          $t = ($k <= $bias) ? $this->_tmin :
 495                                  (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
 496                          if ($q < $t) break;
 497                          $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
 498                          $q = (int) (($q - $t) / ($this->_base - $t));
 499                      }
 500                      $encoded .= $this->_encode_digit($q);
 501                      $bias = $this->_adapt($delta, $codecount+1, $is_first);
 502                      $codecount++;
 503                      $delta = 0;
 504                      $is_first = false;
 505                  }
 506              }
 507              $delta++;
 508              $cur_code++;
 509          }
 510          return $encoded;
 511      }
 512  
 513      /**
 514       * Adapt the bias according to the current code point and position
 515       * @access   private
 516       */
 517      function _adapt($delta, $npoints, $is_first)
 518      {
 519          $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
 520          $delta += intval($delta / $npoints);
 521          for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) {
 522              $delta = intval($delta / ($this->_base - $this->_tmin));
 523          }
 524          return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew));
 525      }
 526  
 527      /**
 528       * Encoding a certain digit
 529       * @access   private
 530       */
 531      function _encode_digit($d)
 532      {
 533          return chr($d + 22 + 75 * ($d < 26));
 534      }
 535  
 536      /**
 537       * Decode a certain digit
 538       * @access   private
 539       */
 540      function _decode_digit($cp)
 541      {
 542          $cp = ord($cp);
 543          return ($cp - 48 < 10) ? $cp - 22 : (($cp - 65 < 26) ? $cp - 65 : (($cp - 97 < 26) ? $cp - 97 : $this->_base));
 544      }
 545  
 546      /**
 547       * Internal error handling method
 548       * @access   private
 549       */
 550      function _error($error = '')
 551      {
 552          $this->_error = $error;
 553      }
 554  
 555      /**
 556       * Do Nameprep according to RFC3491 and RFC3454
 557       * @param    array    Unicode Characters
 558       * @return   array    Unicode code points, Nameprep'd (false if prohibited input is found)
 559       * @access   private
 560       */
 561      function _nameprep($input)
 562      {
 563          $output = array();
 564          $error = false;
 565          //
 566          // Mapping
 567          // Walking through the input array, performing the required steps on each of
 568          // the input chars and putting the result into the output array
 569          // While mapping required chars we apply the cannonical ordering
 570          foreach ($input as $v) {
 571              // Map to nothing == skip that code point
 572              if (in_array($v, $this->NP['map_nothing'])) continue;
 573  
 574              // Try to find prohibited input
 575              if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
 576                  $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
 577                  return false;
 578              }
 579              foreach ($this->NP['prohibit_ranges'] as $range) {
 580                  if ($range[0] <= $v && $v <= $range[1]) {
 581                      $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
 582                      return false;
 583                  }
 584              }
 585              //
 586              // Hangul syllable decomposition
 587              if (0xAC00 <= $v && $v <= 0xD7AF) {
 588                  foreach ($this->_hangul_decompose($v) as $out) {
 589                      $output[] = (int) $out;
 590                  }
 591              // There's a decomposition mapping for that code point
 592              } elseif (isset($this->NP['replacemaps'][$v])) {
 593                  foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
 594                      $output[] = (int) $out;
 595                  }
 596              } else {
 597                  $output[] = (int) $v;
 598              }
 599          }
 600          // Before applying any Combining, try to rearrange any Hangul syllables
 601          $output = $this->_hangul_compose($output);
 602          //
 603          // Combine code points
 604          //
 605          $last_class   = 0;
 606          $last_starter = 0;
 607          $out_len      = count($output);
 608          for ($i = 0; $i < $out_len; ++$i) {
 609              $class = $this->_get_combining_class($output[$i]);
 610              if ((!$last_class || $last_class > $class) && $class) {
 611                  // Try to match
 612                  $seq_len = $i - $last_starter;
 613                  $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
 614                  // On match: Replace the last starter with the composed character and remove
 615                  // the now redundant non-starter(s)
 616                  if ($out) {
 617                      $output[$last_starter] = $out;
 618                      if (count($out) != $seq_len) {
 619                          for ($j = $i+1; $j < $out_len; ++$j) {
 620                              $output[$j-1] = $output[$j];
 621                          }
 622                          unset($output[$out_len]);
 623                      }
 624                      // Rewind the for loop by one, since there can be more possible compositions
 625                      $i--;
 626                      $out_len--;
 627                      $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
 628                      continue;
 629                  }
 630              }
 631              // The current class is 0
 632              if (!$class) $last_starter = $i;
 633              $last_class = $class;
 634          }
 635          return $output;
 636      }
 637  
 638      /**
 639       * Decomposes a Hangul syllable
 640       * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
 641       * @param    integer  32bit UCS4 code point
 642       * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array
 643       * @access   private
 644       */
 645      function _hangul_decompose($char)
 646      {
 647          $sindex = (int) $char - $this->_sbase;
 648          if ($sindex < 0 || $sindex >= $this->_scount) {
 649              return array($char);
 650          }
 651          $result = array();
 652          $result[] = (int) $this->_lbase + $sindex / $this->_ncount;
 653          $result[] = (int) $this->_vbase + ($sindex % $this->_ncount) / $this->_tcount;
 654          $T = intval($this->_tbase + $sindex % $this->_tcount);
 655          if ($T != $this->_tbase) $result[] = $T;
 656          return $result;
 657      }
 658      /**
 659       * Composes a Hangul syllable
 660       * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
 661       * @param    array    Decomposed UCS4 sequence
 662       * @return   array    UCS4 sequence with syllables composed
 663       * @access   private
 664       */
 665      function _hangul_compose($input)
 666      {
 667          $inp_len = count($input);
 668          if (!$inp_len) return array();
 669          $result = array();
 670          $last = (int) $input[0];
 671          $result[] = $last; // copy first char from input to output
 672  
 673          for ($i = 1; $i < $inp_len; ++$i) {
 674              $char = (int) $input[$i];
 675              $sindex = $last - $this->_sbase;
 676              $lindex = $last - $this->_lbase;
 677              $vindex = $char - $this->_vbase;
 678              $tindex = $char - $this->_tbase;
 679              // Find out, whether two current characters are LV and T
 680              if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
 681                      && 0 <= $tindex && $tindex <= $this->_tcount) {
 682                  // create syllable of form LVT
 683                  $last += $tindex;
 684                  $result[(count($result) - 1)] = $last; // reset last
 685                  continue; // discard char
 686              }
 687              // Find out, whether two current characters form L and V
 688              if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
 689                  // create syllable of form LV
 690                  $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
 691                  $result[(count($result) - 1)] = $last; // reset last
 692                  continue; // discard char
 693              }
 694              // if neither case was true, just add the character
 695              $last = $char;
 696              $result[] = $char;
 697          }
 698          return $result;
 699      }
 700  
 701      /**
 702       * Returns the combining class of a certain wide char
 703       * @param    integer    Wide char to check (32bit integer)
 704       * @return   integer    Combining class if found, else 0
 705       * @access   private
 706       */
 707      function _get_combining_class($char)
 708      {
 709          return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0;
 710      }
 711  
 712      /**
 713       * Applies the canonical ordering of a decomposed UCS4 sequence
 714       * @param    array      Decomposed UCS4 sequence
 715       * @return   array      Ordered UCS4 sequence
 716       * @access   private
 717       */
 718      function _apply_cannonical_ordering($input)
 719      {
 720          $swap = true;
 721          $size = count($input);
 722          while ($swap) {
 723              $swap = false;
 724              $last = $this->_get_combining_class(intval($input[0]));
 725              for ($i = 0; $i < $size-1; ++$i) {
 726                  $next = $this->_get_combining_class(intval($input[$i+1]));
 727                  if ($next != 0 && $last > $next) {
 728                      // Move item leftward until it fits
 729                      for ($j = $i + 1; $j > 0; --$j) {
 730                          if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break;
 731                          $t = intval($input[$j]);
 732                          $input[$j] = intval($input[$j-1]);
 733                          $input[$j-1] = $t;
 734                          $swap = true;
 735                      }
 736                      // Reentering the loop looking at the old character again
 737                      $next = $last;
 738                  }
 739                  $last = $next;
 740              }
 741          }
 742          return $input;
 743      }
 744  
 745      /**
 746       * Do composition of a sequence of starter and non-starter
 747       * @param    array      UCS4 Decomposed sequence
 748       * @return   mixed      Key of the matching replacement map entry, false if none matches
 749       * @access   private
 750       */
 751      function _combine($input)
 752      {
 753          $inp_len = count($input);
 754          foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
 755              if ($np_target[0] != $input[0]) continue;
 756              if (count($np_target) != $inp_len) continue;
 757              $hit = false;
 758              foreach ($input as $k2 => $v2) {
 759                  if ($v2 == $np_target[$k2]) {
 760                      $hit = true;
 761                  } else {
 762                      $hit = false;
 763                      break;
 764                  }
 765              }
 766              if ($hit) return $np_src;
 767          }
 768          return false;
 769      }
 770  
 771      /**
 772       * This converts an UTF-8 encoded string to its UCS-4 representation
 773       * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
 774       * each of the "chars". This is due to PHP not being able to handle strings with
 775       * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
 776       * The following UTF-8 encodings are supported:
 777       * bytes bits  representation
 778       * 1        7  0xxxxxxx
 779       * 2       11  110xxxxx 10xxxxxx
 780       * 3       16  1110xxxx 10xxxxxx 10xxxxxx
 781       * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 782       * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 783       * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 784       * Each x represents a bit that can be used to store character data.
 785       * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
 786       * @access   private
 787       */
 788      function _utf8_to_ucs4($input)
 789      {
 790          $output = array();
 791          $out_len = 0;
 792          $inp_len = strlen($input);
 793          $mode = 'next';
 794          $test = 'none';
 795          for ($k = 0; $k < $inp_len; ++$k) {
 796              $v = ord($input[$k]); // Extract byte from input string
 797  
 798              if ($v < 128) { // We found an ASCII char - put into stirng as is
 799                  $output[$out_len] = $v;
 800                  ++$out_len;
 801                  if ('add' == $mode) {
 802                      $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
 803                      return false;
 804                  }
 805                  continue;
 806              }
 807              if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
 808                  $start_byte = $v;
 809                  $mode = 'add';
 810                  $test = 'range';
 811                  if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
 812                      $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
 813                      $v = ($v - 192) << 6;
 814                  } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
 815                      $next_byte = 1;
 816                      $v = ($v - 224) << 12;
 817                  } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 818                      $next_byte = 2;
 819                      $v = ($v - 240) << 18;
 820                  } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 821                      $next_byte = 3;
 822                      $v = ($v - 248) << 24;
 823                  } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 824                      $next_byte = 4;
 825                      $v = ($v - 252) << 30;
 826                  } else {
 827                      $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
 828                      return false;
 829                  }
 830                  if ('add' == $mode) {
 831                      $output[$out_len] = (int) $v;
 832                      ++$out_len;
 833                      continue;
 834                  }
 835              }
 836              if ('add' == $mode) {
 837                  if (!$this->_allow_overlong && $test == 'range') {
 838                      $test = 'none';
 839                      if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
 840                          $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
 841                          return false;
 842                      }
 843                  }
 844                  if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
 845                      $v = ($v - 128) << ($next_byte * 6);
 846                      $output[($out_len - 1)] += $v;
 847                      --$next_byte;
 848                  } else {
 849                      $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
 850                      return false;
 851                  }
 852                  if ($next_byte < 0) {
 853                      $mode = 'next';
 854                  }
 855              }
 856          } // for
 857          return $output;
 858      }
 859  
 860      /**
 861       * Convert UCS-4 string into UTF-8 string
 862       * See _utf8_to_ucs4() for details
 863       * @access   private
 864       */
 865      function _ucs4_to_utf8($input)
 866      {
 867          $output = '';
 868          $k = 0;
 869          foreach ($input as $v) {
 870              ++$k;
 871              // $v = ord($v);
 872              if ($v < 128) { // 7bit are transferred literally
 873                  $output .= chr($v);
 874              } elseif ($v < (1 << 11)) { // 2 bytes
 875                  $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
 876              } elseif ($v < (1 << 16)) { // 3 bytes
 877                  $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
 878              } elseif ($v < (1 << 21)) { // 4 bytes
 879                  $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
 880                           . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
 881              } elseif ($v < (1 << 26)) { // 5 bytes
 882                  $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
 883                           . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
 884                           . chr(128 + ($v & 63));
 885              } elseif ($v < (1 << 31)) { // 6 bytes
 886                  $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
 887                           . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
 888                           . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
 889              } else {
 890                  $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
 891                  return false;
 892              }
 893          }
 894          return $output;
 895      }
 896  
 897      /**
 898        * Convert UCS-4 array into UCS-4 string
 899        *
 900        * @access   private
 901        */
 902      function _ucs4_to_ucs4_string($input)
 903      {
 904          $output = '';
 905          // Take array values and split output to 4 bytes per value
 906          // The bit mask is 255, which reads &11111111
 907          foreach ($input as $v) {
 908              $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
 909          }
 910          return $output;
 911      }
 912  
 913      /**
 914        * Convert UCS-4 string into UCS-4 array
 915        *
 916        * @access   private
 917        */
 918      function _ucs4_string_to_ucs4($input)
 919      {
 920          $output = array();
 921          $inp_len = strlen($input);
 922          // Input length must be dividable by 4
 923          if ($inp_len % 4) {
 924              $this->_error('Input UCS4 string is broken');
 925              return false;
 926          }
 927          // Empty input - return empty output
 928          if (!$inp_len) return $output;
 929          for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
 930              // Increment output position every 4 input bytes
 931              if (!($i % 4)) {
 932                  $out_len++;
 933                  $output[$out_len] = 0;
 934              }
 935              $output[$out_len] += ord($input[$i]) << (8 * (3 - ($i % 4) ) );
 936          }
 937          return $output;
 938      }
 939  }
 940  
 941  /**
 942  * Adapter class for aligning the API of idna_convert with that of Net_IDNA
 943  * @author  Matthias Sommerfeld <mso@phlylabs.de>
 944  */
 945  class Net_IDNA_php4 extends idna_convert
 946  {
 947      /**
 948       * Sets a new option value. Available options and values:
 949       * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
 950       *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
 951       * [overlong - Unicode does not allow unnecessarily long encodings of chars,
 952       *             to allow this, set this parameter to true, else to false;
 953       *             default is false.]
 954       * [strict - true: strict mode, good for registration purposes - Causes errors
 955       *           on failures; false: loose mode, ideal for "wildlife" applications
 956       *           by silently ignoring errors and returning the original input instead
 957       *
 958       * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
 959       * @param    string    Value to use (if parameter 1 is a string)
 960       * @return   boolean   true on success, false otherwise
 961       * @access   public
 962       */
 963      function setParams($option, $param = false)
 964      {
 965          return $this->IC->set_parameters($option, $param);
 966      }
 967  }
 968  
 969  ?>
PHP Cross Reference of DokuWiki

/vendor/simplepie/simplepie/idn/ -> idna_convert.class.php (source)

Global DokuWiki Links