[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Utf8/ -> Conversion.php (source)

   1  <?php
   2  
   3  namespace dokuwiki\Utf8;
   4  
   5  /**
   6   * Methods to convert from and to UTF-8 strings
   7   */
   8  class Conversion
   9  {
  10      /**
  11       * Encodes UTF-8 characters to HTML entities
  12       *
  13       * @author Tom N Harris <tnharris@whoopdedo.org>
  14       * @author <vpribish at shopping dot com>
  15       * @link   http://php.net/manual/en/function.utf8-decode.php
  16       *
  17       * @param string $str
  18       * @param bool $all Encode non-utf8 char to HTML as well
  19       * @return string
  20       */
  21      public static function toHtml($str, $all = false)
  22      {
  23          $ret = '';
  24          foreach (Unicode::fromUtf8($str) as $cp) {
  25              if ($cp < 0x80 && !$all) {
  26                  $ret .= chr($cp);
  27              } elseif ($cp < 0x100) {
  28                  $ret .= "&#$cp;";
  29              } else {
  30                  $ret .= '&#x' . dechex($cp) . ';';
  31              }
  32          }
  33          return $ret;
  34      }
  35  
  36      /**
  37       * Decodes HTML entities to UTF-8 characters
  38       *
  39       * Convert any &#..; entity to a codepoint,
  40       * The entities flag defaults to only decoding numeric entities.
  41       * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
  42       * are handled as well. Avoids the problem that would occur if you
  43       * had to decode "&amp;#38;&#38;amp;#38;"
  44       *
  45       * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
  46       * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
  47       * what it should be                   -> "&#38;&amp#38;"
  48       *
  49       * @author Tom N Harris <tnharris@whoopdedo.org>
  50       *
  51       * @param  string $str UTF-8 encoded string
  52       * @param  boolean $entities decode name entities in addtition to numeric ones
  53       * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
  54       */
  55      public static function fromHtml($str, $entities = false)
  56      {
  57          if (!$entities) {
  58              return preg_replace_callback(
  59                  '/(&#([Xx])?([0-9A-Za-z]+);)/m',
  60                  [self::class, 'decodeNumericEntity'],
  61                  $str
  62              );
  63          }
  64  
  65          return preg_replace_callback(
  66              '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
  67              [self::class, 'decodeAnyEntity'],
  68              $str
  69          );
  70      }
  71  
  72      /**
  73       * Decodes any HTML entity to it's correct UTF-8 char equivalent
  74       *
  75       * @param string $ent An entity
  76       * @return string
  77       */
  78      protected static function decodeAnyEntity($ent)
  79      {
  80          // create the named entity lookup table
  81          static $table = null;
  82          if ($table === null) {
  83              $table = get_html_translation_table(HTML_ENTITIES);
  84              $table = array_flip($table);
  85              $table = array_map(
  86                  static fn($c) => Unicode::toUtf8([ord($c)]),
  87                  $table
  88              );
  89          }
  90  
  91          if ($ent[1] === '#') {
  92              return self::decodeNumericEntity($ent);
  93          }
  94  
  95          if (array_key_exists($ent[0], $table)) {
  96              return $table[$ent[0]];
  97          }
  98  
  99          return $ent[0];
 100      }
 101  
 102      /**
 103       * Decodes numeric HTML entities to their correct UTF-8 characters
 104       *
 105       * @param $ent string A numeric entity
 106       * @return string|false
 107       */
 108      protected static function decodeNumericEntity($ent)
 109      {
 110          switch ($ent[2]) {
 111              case 'X':
 112              case 'x':
 113                  $cp = hexdec($ent[3]);
 114                  break;
 115              default:
 116                  $cp = (int) $ent[3];
 117                  break;
 118          }
 119          return Unicode::toUtf8([$cp]);
 120      }
 121  
 122      /**
 123       * UTF-8 to UTF-16BE conversion.
 124       *
 125       * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 126       *
 127       * @param string $str
 128       * @param bool $bom
 129       * @return string
 130       */
 131      public static function toUtf16be($str, $bom = false)
 132      {
 133          $out = $bom ? "\xFE\xFF" : '';
 134          if (UTF8_MBSTRING) {
 135              return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
 136          }
 137  
 138          $uni = Unicode::fromUtf8($str);
 139          foreach ($uni as $cp) {
 140              $out .= pack('n', $cp);
 141          }
 142          return $out;
 143      }
 144  
 145      /**
 146       * UTF-8 to UTF-16BE conversion.
 147       *
 148       * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 149       *
 150       * @param string $str
 151       * @return false|string
 152       */
 153      public static function fromUtf16be($str)
 154      {
 155          $uni = unpack('n*', $str);
 156          return Unicode::toUtf8($uni);
 157      }
 158  }