[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Utf8/ -> Clean.php (source)

   1  <?php
   2  
   3  namespace dokuwiki\Utf8;
   4  
   5  /**
   6   * Methods to assess and clean UTF-8 strings
   7   */
   8  class Clean
   9  {
  10      /**
  11       * Checks if a string contains 7bit ASCII only
  12       *
  13       * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
  14       *
  15       * @param string $str
  16       * @return bool
  17       */
  18      public static function isASCII($str)
  19      {
  20          return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
  21      }
  22  
  23      /**
  24       * Tries to detect if a string is in Unicode encoding
  25       *
  26       * @author <bmorel@ssi.fr>
  27       * @link   http://php.net/manual/en/function.utf8-encode.php
  28       *
  29       * @param string $str
  30       * @return bool
  31       */
  32      public static function isUtf8($str)
  33      {
  34          $len = strlen($str);
  35          for ($i = 0; $i < $len; $i++) {
  36              $b = ord($str[$i]);
  37              if ($b < 0x80) continue; # 0bbbbbbb
  38              elseif (($b & 0xE0) === 0xC0) $n = 1; # 110bbbbb
  39              elseif (($b & 0xF0) === 0xE0) $n = 2; # 1110bbbb
  40              elseif (($b & 0xF8) === 0xF0) $n = 3; # 11110bbb
  41              elseif (($b & 0xFC) === 0xF8) $n = 4; # 111110bb
  42              elseif (($b & 0xFE) === 0xFC) $n = 5; # 1111110b
  43              else return false; # Does not match any model
  44  
  45              for ($j = 0; $j < $n; $j++) { # n bytes matching 10bbbbbb follow ?
  46                  if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80))
  47                      return false;
  48              }
  49          }
  50          return true;
  51      }
  52  
  53      /**
  54       * Strips all high byte chars
  55       *
  56       * Returns a pure ASCII7 string
  57       *
  58       * @author Andreas Gohr <andi@splitbrain.org>
  59       *
  60       * @param string $str
  61       * @return string
  62       */
  63      public static function strip($str)
  64      {
  65          $ascii = '';
  66          $len = strlen($str);
  67          for ($i = 0; $i < $len; $i++) {
  68              if (ord($str[$i]) < 128) {
  69                  $ascii .= $str[$i];
  70              }
  71          }
  72          return $ascii;
  73      }
  74  
  75      /**
  76       * Removes special characters (nonalphanumeric) from a UTF-8 string
  77       *
  78       * This function adds the controlchars 0x00 to 0x19 to the array of
  79       * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
  80       *
  81       * @author Andreas Gohr <andi@splitbrain.org>
  82       *
  83       * @param  string $string The UTF8 string to strip of special chars
  84       * @param  string $repl Replace special with this string
  85       * @param  string $additional Additional chars to strip (used in regexp char class)
  86       * @return string
  87       */
  88      public static function stripspecials($string, $repl = '', $additional = '')
  89      {
  90          static $specials = null;
  91          if ($specials === null) {
  92              $specials = preg_quote(Table::specialChars(), '/');
  93          }
  94  
  95          return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
  96      }
  97  
  98      /**
  99       * Replace bad bytes with an alternative character
 100       *
 101       * ASCII character is recommended for replacement char
 102       *
 103       * PCRE Pattern to locate bad bytes in a UTF-8 string
 104       * Comes from W3 FAQ: Multilingual Forms
 105       * Note: modified to include full ASCII range including control chars
 106       *
 107       * @author Harry Fuecks <hfuecks@gmail.com>
 108       * @see http://www.w3.org/International/questions/qa-forms-utf-8
 109       *
 110       * @param string $str to search
 111       * @param string $replace to replace bad bytes with (defaults to '?') - use ASCII
 112       * @return string
 113       */
 114      public static function replaceBadBytes($str, $replace = '')
 115      {
 116          $UTF8_BAD =
 117              '([\x00-\x7F]' .                          # ASCII (including control chars)
 118              '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
 119              '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
 120              '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
 121              '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
 122              '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
 123              '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
 124              '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
 125              '|(.{1}))';                               # invalid byte
 126          ob_start();
 127          while (preg_match('/' . $UTF8_BAD . '/S', $str, $matches)) {
 128              if (!isset($matches[2])) {
 129                  echo $matches[0];
 130              } else {
 131                  echo $replace;
 132              }
 133              $str = substr($str, strlen($matches[0]));
 134          }
 135          return ob_get_clean();
 136      }
 137  
 138  
 139      /**
 140       * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 141       *
 142       * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 143       * letters. Default is to deaccent both cases ($case = 0)
 144       *
 145       * @author Andreas Gohr <andi@splitbrain.org>
 146       *
 147       * @param string $string
 148       * @param int $case
 149       * @return string
 150       */
 151      public static function deaccent($string, $case = 0)
 152      {
 153          if ($case <= 0) {
 154              $string = strtr($string, Table::lowerAccents());
 155          }
 156          if ($case >= 0) {
 157              $string = strtr($string, Table::upperAccents());
 158          }
 159          return $string;
 160      }
 161  
 162      /**
 163       * Romanize a non-latin string
 164       *
 165       * @author Andreas Gohr <andi@splitbrain.org>
 166       *
 167       * @param string $string
 168       * @return string
 169       */
 170      public static function romanize($string)
 171      {
 172          if (self::isASCII($string)) return $string; //nothing to do
 173  
 174          return strtr($string, Table::romanization());
 175      }
 176  
 177      /**
 178       * adjust a byte index into a utf8 string to a utf8 character boundary
 179       *
 180       * @author       chris smith <chris@jalakai.co.uk>
 181       *
 182       * @param string $str utf8 character string
 183       * @param int $i byte index into $str
 184       * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
 185       * @return int byte index into $str now pointing to a utf8 character boundary
 186       */
 187      public static function correctIdx($str, $i, $next = false)
 188      {
 189  
 190          if ($i <= 0) return 0;
 191  
 192          $limit = strlen($str);
 193          if ($i >= $limit) return $limit;
 194  
 195          if ($next) {
 196              while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) $i++;
 197          } else {
 198              while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) $i--;
 199          }
 200  
 201          return $i;
 202      }
 203  }