[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Parsing/Lexer/ -> Lexer.php (source)

   1  <?php
   2  /**
   3   * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   4   * For an intro to the Lexer see:
   5   * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
   6   *
   7   * @author Marcus Baker http://www.lastcraft.com
   8   */
   9  
  10  namespace dokuwiki\Parsing\Lexer;
  11  
  12  // FIXME move elsewhere
  13  
  14  define("DOKU_LEXER_ENTER", 1);
  15  define("DOKU_LEXER_MATCHED", 2);
  16  define("DOKU_LEXER_UNMATCHED", 3);
  17  define("DOKU_LEXER_EXIT", 4);
  18  define("DOKU_LEXER_SPECIAL", 5);
  19  
  20  /**
  21   * Accepts text and breaks it into tokens.
  22   *
  23   * Some optimisation to make the sure the content is only scanned by the PHP regex
  24   * parser once. Lexer modes must not start with leading underscores.
  25   */
  26  class Lexer
  27  {
  28      /** @var ParallelRegex[] */
  29      protected $regexes;
  30      /** @var \Doku_Handler */
  31      protected $handler;
  32      /** @var StateStack */
  33      protected $modeStack;
  34      /** @var array mode "rewrites" */
  35      protected $mode_handlers;
  36      /** @var bool case sensitive? */
  37      protected $case;
  38  
  39      /**
  40       * Sets up the lexer in case insensitive matching by default.
  41       *
  42       * @param \Doku_Handler $handler  Handling strategy by reference.
  43       * @param string $start            Starting handler.
  44       * @param boolean $case            True for case sensitive.
  45       */
  46      public function __construct($handler, $start = "accept", $case = false)
  47      {
  48          $this->case = $case;
  49          $this->regexes = array();
  50          $this->handler = $handler;
  51          $this->modeStack = new StateStack($start);
  52          $this->mode_handlers = array();
  53      }
  54  
  55      /**
  56       * Adds a token search pattern for a particular parsing mode.
  57       *
  58       * The pattern does not change the current mode.
  59       *
  60       * @param string $pattern      Perl style regex, but ( and )
  61       *                             lose the usual meaning.
  62       * @param string $mode         Should only apply this
  63       *                             pattern when dealing with
  64       *                             this type of input.
  65       */
  66      public function addPattern($pattern, $mode = "accept")
  67      {
  68          if (! isset($this->regexes[$mode])) {
  69              $this->regexes[$mode] = new ParallelRegex($this->case);
  70          }
  71          $this->regexes[$mode]->addPattern($pattern);
  72      }
  73  
  74      /**
  75       * Adds a pattern that will enter a new parsing mode.
  76       *
  77       * Useful for entering parenthesis, strings, tags, etc.
  78       *
  79       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
  80       * @param string $mode         Should only apply this pattern when dealing with this type of input.
  81       * @param string $new_mode     Change parsing to this new nested mode.
  82       */
  83      public function addEntryPattern($pattern, $mode, $new_mode)
  84      {
  85          if (! isset($this->regexes[$mode])) {
  86              $this->regexes[$mode] = new ParallelRegex($this->case);
  87          }
  88          $this->regexes[$mode]->addPattern($pattern, $new_mode);
  89      }
  90  
  91      /**
  92       * Adds a pattern that will exit the current mode and re-enter the previous one.
  93       *
  94       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
  95       * @param string $mode         Mode to leave.
  96       */
  97      public function addExitPattern($pattern, $mode)
  98      {
  99          if (! isset($this->regexes[$mode])) {
 100              $this->regexes[$mode] = new ParallelRegex($this->case);
 101          }
 102          $this->regexes[$mode]->addPattern($pattern, "__exit");
 103      }
 104  
 105      /**
 106       * Adds a pattern that has a special mode.
 107       *
 108       * Acts as an entry and exit pattern in one go, effectively calling a special
 109       * parser handler for this token only.
 110       *
 111       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
 112       * @param string $mode         Should only apply this pattern when dealing with this type of input.
 113       * @param string $special      Use this mode for this one token.
 114       */
 115      public function addSpecialPattern($pattern, $mode, $special)
 116      {
 117          if (! isset($this->regexes[$mode])) {
 118              $this->regexes[$mode] = new ParallelRegex($this->case);
 119          }
 120          $this->regexes[$mode]->addPattern($pattern, "_$special");
 121      }
 122  
 123      /**
 124       * Adds a mapping from a mode to another handler.
 125       *
 126       * @param string $mode        Mode to be remapped.
 127       * @param string $handler     New target handler.
 128       */
 129      public function mapHandler($mode, $handler)
 130      {
 131          $this->mode_handlers[$mode] = $handler;
 132      }
 133  
 134      /**
 135       * Splits the page text into tokens.
 136       *
 137       * Will fail if the handlers report an error or if no content is consumed. If successful then each
 138       * unparsed and parsed token invokes a call to the held listener.
 139       *
 140       * @param string $raw        Raw HTML text.
 141       * @return boolean           True on success, else false.
 142       */
 143      public function parse($raw)
 144      {
 145          if (! isset($this->handler)) {
 146              return false;
 147          }
 148          $initialLength = strlen($raw);
 149          $length = $initialLength;
 150          $pos = 0;
 151          while (is_array($parsed = $this->reduce($raw))) {
 152              list($unmatched, $matched, $mode) = $parsed;
 153              $currentLength = strlen($raw);
 154              $matchPos = $initialLength - $currentLength - strlen($matched);
 155              if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 156                  return false;
 157              }
 158              if ($currentLength == $length) {
 159                  return false;
 160              }
 161              $length = $currentLength;
 162              $pos = $initialLength - $currentLength;
 163          }
 164          if (!$parsed) {
 165              return false;
 166          }
 167          return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
 168      }
 169  
 170      /**
 171       * Gives plugins access to the mode stack
 172       *
 173       * @return StateStack
 174       */
 175      public function getModeStack()
 176      {
 177          return $this->modeStack;
 178      }
 179  
 180      /**
 181       * Sends the matched token and any leading unmatched
 182       * text to the parser changing the lexer to a new
 183       * mode if one is listed.
 184       *
 185       * @param string $unmatched Unmatched leading portion.
 186       * @param string $matched Actual token match.
 187       * @param bool|string $mode Mode after match. A boolean false mode causes no change.
 188       * @param int $initialPos
 189       * @param int $matchPos Current byte index location in raw doc thats being parsed
 190       * @return boolean             False if there was any error from the parser.
 191       */
 192      protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
 193      {
 194          if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
 195              return false;
 196          }
 197          if ($this->isModeEnd($mode)) {
 198              if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
 199                  return false;
 200              }
 201              return $this->modeStack->leave();
 202          }
 203          if ($this->isSpecialMode($mode)) {
 204              $this->modeStack->enter($this->decodeSpecial($mode));
 205              if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
 206                  return false;
 207              }
 208              return $this->modeStack->leave();
 209          }
 210          if (is_string($mode)) {
 211              $this->modeStack->enter($mode);
 212              return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
 213          }
 214          return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
 215      }
 216  
 217      /**
 218       * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
 219       * mode stack.
 220       *
 221       * @param string $mode    Mode to test.
 222       * @return boolean        True if this is the exit mode.
 223       */
 224      protected function isModeEnd($mode)
 225      {
 226          return ($mode === "__exit");
 227      }
 228  
 229      /**
 230       * Test to see if the mode is one where this mode is entered for this token only and automatically
 231       * leaves immediately afterwoods.
 232       *
 233       * @param string $mode    Mode to test.
 234       * @return boolean        True if this is the exit mode.
 235       */
 236      protected function isSpecialMode($mode)
 237      {
 238          return (strncmp($mode, "_", 1) == 0);
 239      }
 240  
 241      /**
 242       * Strips the magic underscore marking single token modes.
 243       *
 244       * @param string $mode    Mode to decode.
 245       * @return string         Underlying mode name.
 246       */
 247      protected function decodeSpecial($mode)
 248      {
 249          return substr($mode, 1);
 250      }
 251  
 252      /**
 253       * Calls the parser method named after the current mode.
 254       *
 255       * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
 256       *
 257       * @param string $content Text parsed.
 258       * @param boolean $is_match Token is recognised rather
 259       *                               than unparsed data.
 260       * @param int $pos Current byte index location in raw doc
 261       *                             thats being parsed
 262       * @return bool
 263       */
 264      protected function invokeHandler($content, $is_match, $pos)
 265      {
 266          if (($content === "") || ($content === false)) {
 267              return true;
 268          }
 269          $handler = $this->modeStack->getCurrent();
 270          if (isset($this->mode_handlers[$handler])) {
 271              $handler = $this->mode_handlers[$handler];
 272          }
 273  
 274          // modes starting with plugin_ are all handled by the same
 275          // handler but with an additional parameter
 276          if (substr($handler, 0, 7)=='plugin_') {
 277              list($handler,$plugin) = explode('_', $handler, 2);
 278              return $this->handler->$handler($content, $is_match, $pos, $plugin);
 279          }
 280  
 281          return $this->handler->$handler($content, $is_match, $pos);
 282      }
 283  
 284      /**
 285       * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
 286       * unparsed data. Empty strings will not be matched.
 287       *
 288       * @param string $raw         The subject to parse. This is the content that will be eaten.
 289       * @return array|bool         Three item list of unparsed content followed by the
 290       *                            recognised token and finally the action the parser is to take.
 291       *                            True if no match, false if there is a parsing error.
 292       */
 293      protected function reduce(&$raw)
 294      {
 295          if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
 296              return false;
 297          }
 298          if ($raw === "") {
 299              return true;
 300          }
 301          if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
 302              list($unparsed, $match, $raw) = $split;
 303              return array($unparsed, $match, $action);
 304          }
 305          return true;
 306      }
 307  
 308      /**
 309       * Escapes regex characters other than (, ) and /
 310       *
 311       * @param string $str
 312       * @return string
 313       */
 314      public static function escape($str)
 315      {
 316          $chars = array(
 317              '/\\\\/',
 318              '/\./',
 319              '/\+/',
 320              '/\*/',
 321              '/\?/',
 322              '/\[/',
 323              '/\^/',
 324              '/\]/',
 325              '/\$/',
 326              '/\{/',
 327              '/\}/',
 328              '/\=/',
 329              '/\!/',
 330              '/\</',
 331              '/\>/',
 332              '/\|/',
 333              '/\:/'
 334          );
 335  
 336          $escaped = array(
 337              '\\\\\\\\',
 338              '\.',
 339              '\+',
 340              '\*',
 341              '\?',
 342              '\[',
 343              '\^',
 344              '\]',
 345              '\$',
 346              '\{',
 347              '\}',
 348              '\=',
 349              '\!',
 350              '\<',
 351              '\>',
 352              '\|',
 353              '\:'
 354          );
 355          return preg_replace($chars, $escaped, $str);
 356      }
 357  }