[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Utf8/ -> Conversion.php (source)

   1  <?php
   2  
   3  namespace dokuwiki\Utf8;
   4  
   5  /**
   6   * Methods to convert from and to UTF-8 strings
   7   */
   8  class Conversion
   9  {
  10  
  11      /**
  12       * Encodes UTF-8 characters to HTML entities
  13       *
  14       * @author Tom N Harris <tnharris@whoopdedo.org>
  15       * @author <vpribish at shopping dot com>
  16       * @link   http://php.net/manual/en/function.utf8-decode.php
  17       *
  18       * @param string $str
  19       * @param bool $all Encode non-utf8 char to HTML as well
  20       * @return string
  21       */
  22      public static function toHtml($str, $all = false)
  23      {
  24          $ret = '';
  25          foreach (Unicode::fromUtf8($str) as $cp) {
  26              if ($cp < 0x80 && !$all) {
  27                  $ret .= chr($cp);
  28              } elseif ($cp < 0x100) {
  29                  $ret .= "&#$cp;";
  30              } else {
  31                  $ret .= '&#x' . dechex($cp) . ';';
  32              }
  33          }
  34          return $ret;
  35      }
  36  
  37      /**
  38       * Decodes HTML entities to UTF-8 characters
  39       *
  40       * Convert any &#..; entity to a codepoint,
  41       * The entities flag defaults to only decoding numeric entities.
  42       * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
  43       * are handled as well. Avoids the problem that would occur if you
  44       * had to decode "&amp;#38;&#38;amp;#38;"
  45       *
  46       * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
  47       * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
  48       * what it should be                   -> "&#38;&amp#38;"
  49       *
  50       * @author Tom N Harris <tnharris@whoopdedo.org>
  51       *
  52       * @param  string $str UTF-8 encoded string
  53       * @param  boolean $entities decode name entities in addtition to numeric ones
  54       * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
  55       */
  56      public static function fromHtml($str, $entities = false)
  57      {
  58          if (!$entities) {
  59              return preg_replace_callback(
  60                  '/(&#([Xx])?([0-9A-Za-z]+);)/m',
  61                  [__CLASS__, 'decodeNumericEntity'],
  62                  $str
  63              );
  64          }
  65  
  66          return preg_replace_callback(
  67              '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
  68              [__CLASS__, 'decodeAnyEntity'],
  69              $str
  70          );
  71      }
  72  
  73      /**
  74       * Decodes any HTML entity to it's correct UTF-8 char equivalent
  75       *
  76       * @param string $ent An entity
  77       * @return string
  78       */
  79      protected static function decodeAnyEntity($ent)
  80      {
  81          // create the named entity lookup table
  82          static $table = null;
  83          if ($table === null) {
  84              $table = get_html_translation_table(HTML_ENTITIES);
  85              $table = array_flip($table);
  86              $table = array_map(
  87                  static function ($c) {
  88                      return Unicode::toUtf8(array(ord($c)));
  89                  },
  90                  $table
  91              );
  92          }
  93  
  94          if ($ent[1] === '#') {
  95              return self::decodeNumericEntity($ent);
  96          }
  97  
  98          if (array_key_exists($ent[0], $table)) {
  99              return $table[$ent[0]];
 100          }
 101  
 102          return $ent[0];
 103      }
 104  
 105      /**
 106       * Decodes numeric HTML entities to their correct UTF-8 characters
 107       *
 108       * @param $ent string A numeric entity
 109       * @return string|false
 110       */
 111      protected static function decodeNumericEntity($ent)
 112      {
 113          switch ($ent[2]) {
 114              case 'X':
 115              case 'x':
 116                  $cp = hexdec($ent[3]);
 117                  break;
 118              default:
 119                  $cp = intval($ent[3]);
 120                  break;
 121          }
 122          return Unicode::toUtf8(array($cp));
 123      }
 124  
 125      /**
 126       * UTF-8 to UTF-16BE conversion.
 127       *
 128       * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 129       *
 130       * @param string $str
 131       * @param bool $bom
 132       * @return string
 133       */
 134      public static function toUtf16be($str, $bom = false)
 135      {
 136          $out = $bom ? "\xFE\xFF" : '';
 137          if (UTF8_MBSTRING) {
 138              return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
 139          }
 140  
 141          $uni = Unicode::fromUtf8($str);
 142          foreach ($uni as $cp) {
 143              $out .= pack('n', $cp);
 144          }
 145          return $out;
 146      }
 147  
 148      /**
 149       * UTF-8 to UTF-16BE conversion.
 150       *
 151       * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 152       *
 153       * @param string $str
 154       * @return false|string
 155       */
 156      public static function fromUtf16be($str)
 157      {
 158          $uni = unpack('n*', $str);
 159          return Unicode::toUtf8($uni);
 160      }
 161  
 162  }