[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Parsing/Lexer/ -> Lexer.php (source)

   1  <?php
   2  /**
   3   * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   4   * For an intro to the Lexer see:
   5   * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
   6   *
   7   * @author Marcus Baker http://www.lastcraft.com
   8   */
   9  
  10  namespace dokuwiki\Parsing\Lexer;
  11  
  12  /**
  13   * Accepts text and breaks it into tokens.
  14   *
  15   * Some optimisation to make the sure the content is only scanned by the PHP regex
  16   * parser once. Lexer modes must not start with leading underscores.
  17   */
  18  class Lexer
  19  {
  20      /** @var ParallelRegex[] */
  21      protected $regexes;
  22      /** @var \Doku_Handler */
  23      protected $handler;
  24      /** @var StateStack */
  25      protected $modeStack;
  26      /** @var array mode "rewrites" */
  27      protected $mode_handlers;
  28      /** @var bool case sensitive? */
  29      protected $case;
  30  
  31      /**
  32       * Sets up the lexer in case insensitive matching by default.
  33       *
  34       * @param \Doku_Handler $handler  Handling strategy by reference.
  35       * @param string $start            Starting handler.
  36       * @param boolean $case            True for case sensitive.
  37       */
  38      public function __construct($handler, $start = "accept", $case = false)
  39      {
  40          $this->case = $case;
  41          $this->regexes = array();
  42          $this->handler = $handler;
  43          $this->modeStack = new StateStack($start);
  44          $this->mode_handlers = array();
  45      }
  46  
  47      /**
  48       * Adds a token search pattern for a particular parsing mode.
  49       *
  50       * The pattern does not change the current mode.
  51       *
  52       * @param string $pattern      Perl style regex, but ( and )
  53       *                             lose the usual meaning.
  54       * @param string $mode         Should only apply this
  55       *                             pattern when dealing with
  56       *                             this type of input.
  57       */
  58      public function addPattern($pattern, $mode = "accept")
  59      {
  60          if (! isset($this->regexes[$mode])) {
  61              $this->regexes[$mode] = new ParallelRegex($this->case);
  62          }
  63          $this->regexes[$mode]->addPattern($pattern);
  64      }
  65  
  66      /**
  67       * Adds a pattern that will enter a new parsing mode.
  68       *
  69       * Useful for entering parenthesis, strings, tags, etc.
  70       *
  71       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
  72       * @param string $mode         Should only apply this pattern when dealing with this type of input.
  73       * @param string $new_mode     Change parsing to this new nested mode.
  74       */
  75      public function addEntryPattern($pattern, $mode, $new_mode)
  76      {
  77          if (! isset($this->regexes[$mode])) {
  78              $this->regexes[$mode] = new ParallelRegex($this->case);
  79          }
  80          $this->regexes[$mode]->addPattern($pattern, $new_mode);
  81      }
  82  
  83      /**
  84       * Adds a pattern that will exit the current mode and re-enter the previous one.
  85       *
  86       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
  87       * @param string $mode         Mode to leave.
  88       */
  89      public function addExitPattern($pattern, $mode)
  90      {
  91          if (! isset($this->regexes[$mode])) {
  92              $this->regexes[$mode] = new ParallelRegex($this->case);
  93          }
  94          $this->regexes[$mode]->addPattern($pattern, "__exit");
  95      }
  96  
  97      /**
  98       * Adds a pattern that has a special mode.
  99       *
 100       * Acts as an entry and exit pattern in one go, effectively calling a special
 101       * parser handler for this token only.
 102       *
 103       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
 104       * @param string $mode         Should only apply this pattern when dealing with this type of input.
 105       * @param string $special      Use this mode for this one token.
 106       */
 107      public function addSpecialPattern($pattern, $mode, $special)
 108      {
 109          if (! isset($this->regexes[$mode])) {
 110              $this->regexes[$mode] = new ParallelRegex($this->case);
 111          }
 112          $this->regexes[$mode]->addPattern($pattern, "_$special");
 113      }
 114  
 115      /**
 116       * Adds a mapping from a mode to another handler.
 117       *
 118       * @param string $mode        Mode to be remapped.
 119       * @param string $handler     New target handler.
 120       */
 121      public function mapHandler($mode, $handler)
 122      {
 123          $this->mode_handlers[$mode] = $handler;
 124      }
 125  
 126      /**
 127       * Splits the page text into tokens.
 128       *
 129       * Will fail if the handlers report an error or if no content is consumed. If successful then each
 130       * unparsed and parsed token invokes a call to the held listener.
 131       *
 132       * @param string $raw        Raw HTML text.
 133       * @return boolean           True on success, else false.
 134       */
 135      public function parse($raw)
 136      {
 137          if (! isset($this->handler)) {
 138              return false;
 139          }
 140          $initialLength = strlen($raw);
 141          $length = $initialLength;
 142          $pos = 0;
 143          while (is_array($parsed = $this->reduce($raw))) {
 144              list($unmatched, $matched, $mode) = $parsed;
 145              $currentLength = strlen($raw);
 146              $matchPos = $initialLength - $currentLength - strlen($matched);
 147              if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 148                  return false;
 149              }
 150              if ($currentLength == $length) {
 151                  return false;
 152              }
 153              $length = $currentLength;
 154              $pos = $initialLength - $currentLength;
 155          }
 156          if (!$parsed) {
 157              return false;
 158          }
 159          return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
 160      }
 161  
 162      /**
 163       * Gives plugins access to the mode stack
 164       *
 165       * @return StateStack
 166       */
 167      public function getModeStack()
 168      {
 169          return $this->modeStack;
 170      }
 171  
 172      /**
 173       * Sends the matched token and any leading unmatched
 174       * text to the parser changing the lexer to a new
 175       * mode if one is listed.
 176       *
 177       * @param string $unmatched Unmatched leading portion.
 178       * @param string $matched Actual token match.
 179       * @param bool|string $mode Mode after match. A boolean false mode causes no change.
 180       * @param int $initialPos
 181       * @param int $matchPos Current byte index location in raw doc thats being parsed
 182       * @return boolean             False if there was any error from the parser.
 183       */
 184      protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
 185      {
 186          if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
 187              return false;
 188          }
 189          if ($this->isModeEnd($mode)) {
 190              if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
 191                  return false;
 192              }
 193              return $this->modeStack->leave();
 194          }
 195          if ($this->isSpecialMode($mode)) {
 196              $this->modeStack->enter($this->decodeSpecial($mode));
 197              if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
 198                  return false;
 199              }
 200              return $this->modeStack->leave();
 201          }
 202          if (is_string($mode)) {
 203              $this->modeStack->enter($mode);
 204              return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
 205          }
 206          return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
 207      }
 208  
 209      /**
 210       * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
 211       * mode stack.
 212       *
 213       * @param string $mode    Mode to test.
 214       * @return boolean        True if this is the exit mode.
 215       */
 216      protected function isModeEnd($mode)
 217      {
 218          return ($mode === "__exit");
 219      }
 220  
 221      /**
 222       * Test to see if the mode is one where this mode is entered for this token only and automatically
 223       * leaves immediately afterwoods.
 224       *
 225       * @param string $mode    Mode to test.
 226       * @return boolean        True if this is the exit mode.
 227       */
 228      protected function isSpecialMode($mode)
 229      {
 230          return (strncmp($mode, "_", 1) == 0);
 231      }
 232  
 233      /**
 234       * Strips the magic underscore marking single token modes.
 235       *
 236       * @param string $mode    Mode to decode.
 237       * @return string         Underlying mode name.
 238       */
 239      protected function decodeSpecial($mode)
 240      {
 241          return substr($mode, 1);
 242      }
 243  
 244      /**
 245       * Calls the parser method named after the current mode.
 246       *
 247       * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
 248       *
 249       * @param string $content Text parsed.
 250       * @param boolean $is_match Token is recognised rather
 251       *                               than unparsed data.
 252       * @param int $pos Current byte index location in raw doc
 253       *                             thats being parsed
 254       * @return bool
 255       */
 256      protected function invokeHandler($content, $is_match, $pos)
 257      {
 258          if (($content === "") || ($content === false)) {
 259              return true;
 260          }
 261          $handler = $this->modeStack->getCurrent();
 262          if (isset($this->mode_handlers[$handler])) {
 263              $handler = $this->mode_handlers[$handler];
 264          }
 265  
 266          // modes starting with plugin_ are all handled by the same
 267          // handler but with an additional parameter
 268          if (substr($handler, 0, 7)=='plugin_') {
 269              list($handler,$plugin) = sexplode('_', $handler, 2, '');
 270              return $this->handler->$handler($content, $is_match, $pos, $plugin);
 271          }
 272  
 273          return $this->handler->$handler($content, $is_match, $pos);
 274      }
 275  
 276      /**
 277       * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
 278       * unparsed data. Empty strings will not be matched.
 279       *
 280       * @param string $raw         The subject to parse. This is the content that will be eaten.
 281       * @return array|bool         Three item list of unparsed content followed by the
 282       *                            recognised token and finally the action the parser is to take.
 283       *                            True if no match, false if there is a parsing error.
 284       */
 285      protected function reduce(&$raw)
 286      {
 287          if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
 288              return false;
 289          }
 290          if ($raw === "") {
 291              return true;
 292          }
 293          if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
 294              list($unparsed, $match, $raw) = $split;
 295              return array($unparsed, $match, $action);
 296          }
 297          return true;
 298      }
 299  
 300      /**
 301       * Escapes regex characters other than (, ) and /
 302       *
 303       * @param string $str
 304       * @return string
 305       */
 306      public static function escape($str)
 307      {
 308          $chars = array(
 309              '/\\\\/',
 310              '/\./',
 311              '/\+/',
 312              '/\*/',
 313              '/\?/',
 314              '/\[/',
 315              '/\^/',
 316              '/\]/',
 317              '/\$/',
 318              '/\{/',
 319              '/\}/',
 320              '/\=/',
 321              '/\!/',
 322              '/\</',
 323              '/\>/',
 324              '/\|/',
 325              '/\:/'
 326          );
 327  
 328          $escaped = array(
 329              '\\\\\\\\',
 330              '\.',
 331              '\+',
 332              '\*',
 333              '\?',
 334              '\[',
 335              '\^',
 336              '\]',
 337              '\$',
 338              '\{',
 339              '\}',
 340              '\=',
 341              '\!',
 342              '\<',
 343              '\>',
 344              '\|',
 345              '\:'
 346          );
 347          return preg_replace($chars, $escaped, $str);
 348      }
 349  }