[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Utf8/ -> Asian.php (source)

   1  <?php
   2  
   3  namespace dokuwiki\Utf8;
   4  
   5  /**
   6   * Methods and constants to handle Asian "words"
   7   *
   8   * This uses a crude regexp to determine which parts of an Asian string should be treated as words.
   9   * This is necessary because in some Asian languages a single unicode char represents a whole idea
  10   * without spaces separating them.
  11   */
  12  class Asian
  13  {
  14  
  15      /**
  16       * This defines a non-capturing group for the use in regular expressions to match any asian character that
  17       * needs to be treated as a word. Uses the Unicode-Ranges for Asian characters taken from
  18       * http://en.wikipedia.org/wiki/Unicode_block
  19       */
  20      const REGEXP =
  21          '(?:' .
  22  
  23          '[\x{0E00}-\x{0E7F}]' . // Thai
  24  
  25          '|' .
  26  
  27          '[' .
  28          '\x{2E80}-\x{3040}' .  // CJK -> Hangul
  29          '\x{309D}-\x{30A0}' .
  30          '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
  31          '\x{F900}-\x{FAFF}' .  // CJK Compatibility Ideographs
  32          '\x{FE30}-\x{FE4F}' .  // CJK Compatibility Forms
  33          "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
  34          "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
  35          "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
  36          "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
  37          ']' .
  38  
  39          '|' .
  40  
  41          '[' .                // Hiragana/Katakana (can be two characters)
  42          '\x{3042}\x{3044}\x{3046}\x{3048}' .
  43          '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
  44          '\x{3084}\x{3086}\x{3088}-\x{308D}' .
  45          '\x{308F}-\x{3094}' .
  46          '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
  47          '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
  48          '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
  49          '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
  50          '][' .
  51          '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
  52          '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
  53          '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
  54          '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
  55          '\x{31F0}-\x{31FF}' .
  56          ']?' .
  57          ')';
  58  
  59  
  60      /**
  61       * Check if the given term contains Asian word characters
  62       *
  63       * @param string $term
  64       * @return bool
  65       */
  66      public static function isAsianWords($term)
  67      {
  68          return (bool)preg_match('/' . self::REGEXP . '/u', $term);
  69      }
  70  
  71      /**
  72       * Surround all Asian words in the given text with the given separator
  73       *
  74       * @param string $text Original text containing asian words
  75       * @param string $sep the separator to use
  76       * @return string Text with separated asian words
  77       */
  78      public static function separateAsianWords($text, $sep = ' ')
  79      {
  80          // handle asian chars as single words (may fail on older PHP version)
  81          $asia = @preg_replace('/(' . self::REGEXP . ')/u', $sep . '\1' . $sep, $text);
  82          if (!is_null($asia)) $text = $asia; // recover from regexp falure
  83  
  84          return $text;
  85      }
  86  
  87      /**
  88       * Split the given text into separate parts
  89       *
  90       * Each part is either a non-asian string, or a single asian word
  91       *
  92       * @param string $term
  93       * @return string[]
  94       */
  95      public static function splitAsianWords($term)
  96      {
  97          return preg_split('/(' . self::REGEXP . '+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
  98      }
  99  }