* @author
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @param bool $all Also encode ASCII characters (code points below 0x80) as entities
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $html = '';
        $codepoints = Unicode::fromUtf8($str);

        foreach ($codepoints as $codepoint) {
            // ASCII is passed through literally unless the caller asked for everything to be encoded
            if (!$all && $codepoint < 0x80) {
                $html .= chr($codepoint);
                continue;
            }

            // code points below 0x100 become decimal entities, everything else hexadecimal ones
            $html .= ($codepoint < 0x100)
                ? "&#$codepoint;"
                : '&#x' . dechex($codepoint) . ';';
        }

        return $html;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES (or any other truthy value) and named entities,
     * including &amp; and &lt; etc., are handled as well. Avoids the problem
     * that would occur if you had to decode "&#38;&amp;#38;"
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&&"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&#38;"
     * what it should be -> "&&#38;"
     *
     * @author Tom N Harris
     *
     * @param string $str UTF-8 encoded string
     * @param boolean $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
*/
    public static function fromHtml($str, $entities = false)
    {
        // Both patterns are laid out so that the callbacks see the same groups:
        //   [0] the complete entity, [2] the optional hex marker, [3] the digits/name.
        // That is why the numeric-only pattern carries an extra outer group.
        if (!$entities) {
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [self::class, 'decodeNumericEntity'],
                $str
            );
        }

        // here group [1] is the optional '#' that tells numeric and named entities apart
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [self::class, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are delegated to decodeNumericEntity(). Named entities are
     * looked up in PHP's HTML 4.01 entity table; unknown names are returned unchanged.
     *
     * @param string[] $ent Match array as produced by the entity pattern in fromHtml()
     * @return string
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table (entity => UTF-8 character) once
        static $table = null;
        if ($table === null) {
            // Ask for the table in UTF-8 explicitly. Its characters are then already
            // correctly encoded (multi-byte sequences included) and can be used as is;
            // re-encoding them through ord() would only look at their first byte.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        return $table[$ent[0]] ?? $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The patterns in fromHtml() accept any alphanumeric run after "&#", so the
     * digits are validated here. References that are malformed (e.g. "&#abc;"),
     * that lie beyond U+10FFFF, or that cannot be converted are returned unchanged
     * instead of being turned into a NUL byte or silently dropped.
     *
     * @param string[] $ent Match array: [0] full entity, [2] optional hex marker, [3] digits
     * @return string
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];
        $isHex = ($ent[2] === 'x' || $ent[2] === 'X');
        $allowed = $isHex ? '0123456789abcdefABCDEF' : '0123456789';

        // anything but pure (hex) digits is not a valid character reference
        if (strspn($digits, $allowed) !== strlen($digits)) {
            return $ent[0];
        }

        $cp = $isHex ? hexdec($digits) : (int) $digits;

        // overly long digit runs end up as huge ints/floats; those are no code points
        if ($cp > 0x10FFFF) {
            return $ent[0];
        }

        // NOTE(review): this method was documented as string|false, i.e. Unicode::toUtf8()
        // may report failure with false; keep the original text in that case.
        $utf8 = Unicode::toUtf8([$cp]);

        return ($utf8 === false) ? $ent[0] : $utf8;
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the code points
     * returned by Unicode::fromUtf8() are packed as big endian 16 bit units, with
     * code points above U+FFFF written as surrogate pairs.
     *
     * @param string $str
     * @param bool $bom Prepend the UTF-16BE byte order mark
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // outside the BMP: pack('n') alone would silently keep only the low 16 bits
                $cp -= 0x10000;
                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as big endian 16 bit units. A high surrogate that is directly
     * followed by a low surrogate is combined into a single code point; all other
     * units are passed on to Unicode::toUtf8() as they are.
     *
     * @param string $str
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            // unpack() failed; hand its result on unchanged
            return Unicode::toUtf8($units);
        }

        // unpack() numbers its results starting at 1, re-index for simple look-ahead
        $units = array_values($units);
        $count = count($units);

        $codepoints = [];
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];
            $isPair = $unit >= 0xD800 && $unit <= 0xDBFF
                && $i + 1 < $count
                && $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF;

            if ($isPair) {
                $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed
            } else {
                $codepoints[] = $unit;
            }
        }

        return Unicode::toUtf8($codepoints);
    }
}