[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Parsing/Lexer/ -> Lexer.php (source)

   1  <?php
   2  /**
   3   * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   4   * For an intro to the Lexer see:
   5   * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
   6   *
   7   * @author Marcus Baker http://www.lastcraft.com
   8   */
   9  
  10  namespace dokuwiki\Parsing\Lexer;
  11  
  12  // FIXME move elsewhere
  13  
  14  define("DOKU_LEXER_ENTER", 1);
  15  define("DOKU_LEXER_MATCHED", 2);
  16  define("DOKU_LEXER_UNMATCHED", 3);
  17  define("DOKU_LEXER_EXIT", 4);
  18  define("DOKU_LEXER_SPECIAL", 5);
  19  
  20  /**
  21   * Accepts text and breaks it into tokens.
  22   *
  23   * Some optimisation to make the sure the content is only scanned by the PHP regex
  24   * parser once. Lexer modes must not start with leading underscores.
  25   */
  26  class Lexer
  27  {
  28      /** @var ParallelRegex[] */
  29      protected $regexes;
  30      /** @var \Doku_Handler */
  31      protected $handler;
  32      /** @var StateStack */
  33      protected $modeStack;
  34      /** @var array mode "rewrites" */
  35      protected $mode_handlers;
  36      /** @var bool case sensitive? */
  37      protected $case;
  38  
  39      /**
  40       * Sets up the lexer in case insensitive matching by default.
  41       *
  42       * @param \Doku_Handler $handler  Handling strategy by reference.
  43       * @param string $start            Starting handler.
  44       * @param boolean $case            True for case sensitive.
  45       */
  46      public function __construct($handler, $start = "accept", $case = false)
  47      {
  48          $this->case = $case;
  49          $this->regexes = array();
  50          $this->handler = $handler;
  51          $this->modeStack = new StateStack($start);
  52          $this->mode_handlers = array();
  53      }
  54  
  55      /**
  56       * Adds a token search pattern for a particular parsing mode.
  57       *
  58       * The pattern does not change the current mode.
  59       *
  60       * @param string $pattern      Perl style regex, but ( and )
  61       *                             lose the usual meaning.
  62       * @param string $mode         Should only apply this
  63       *                             pattern when dealing with
  64       *                             this type of input.
  65       */
  66      public function addPattern($pattern, $mode = "accept")
  67      {
  68          if (! isset($this->regexes[$mode])) {
  69              $this->regexes[$mode] = new ParallelRegex($this->case);
  70          }
  71          $this->regexes[$mode]->addPattern($pattern);
  72      }
  73  
  74      /**
  75       * Adds a pattern that will enter a new parsing mode.
  76       *
  77       * Useful for entering parenthesis, strings, tags, etc.
  78       *
  79       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
  80       * @param string $mode         Should only apply this pattern when dealing with this type of input.
  81       * @param string $new_mode     Change parsing to this new nested mode.
  82       */
  83      public function addEntryPattern($pattern, $mode, $new_mode)
  84      {
  85          if (! isset($this->regexes[$mode])) {
  86              $this->regexes[$mode] = new ParallelRegex($this->case);
  87          }
  88          $this->regexes[$mode]->addPattern($pattern, $new_mode);
  89      }
  90  
  91      /**
  92       * Adds a pattern that will exit the current mode and re-enter the previous one.
  93       *
  94       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
  95       * @param string $mode         Mode to leave.
  96       */
  97      public function addExitPattern($pattern, $mode)
  98      {
  99          if (! isset($this->regexes[$mode])) {
 100              $this->regexes[$mode] = new ParallelRegex($this->case);
 101          }
 102          $this->regexes[$mode]->addPattern($pattern, "__exit");
 103      }
 104  
 105      /**
 106       * Adds a pattern that has a special mode.
 107       *
 108       * Acts as an entry and exit pattern in one go, effectively calling a special
 109       * parser handler for this token only.
 110       *
 111       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
 112       * @param string $mode         Should only apply this pattern when dealing with this type of input.
 113       * @param string $special      Use this mode for this one token.
 114       */
 115      public function addSpecialPattern($pattern, $mode, $special)
 116      {
 117          if (! isset($this->regexes[$mode])) {
 118              $this->regexes[$mode] = new ParallelRegex($this->case);
 119          }
 120          $this->regexes[$mode]->addPattern($pattern, "_$special");
 121      }
 122  
 123      /**
 124       * Adds a mapping from a mode to another handler.
 125       *
 126       * @param string $mode        Mode to be remapped.
 127       * @param string $handler     New target handler.
 128       */
 129      public function mapHandler($mode, $handler)
 130      {
 131          $this->mode_handlers[$mode] = $handler;
 132      }
 133  
 134      /**
 135       * Splits the page text into tokens.
 136       *
 137       * Will fail if the handlers report an error or if no content is consumed. If successful then each
 138       * unparsed and parsed token invokes a call to the held listener.
 139       *
 140       * @param string $raw        Raw HTML text.
 141       * @return boolean           True on success, else false.
 142       */
 143      public function parse($raw)
 144      {
 145          if (! isset($this->handler)) {
 146              return false;
 147          }
 148          $initialLength = strlen($raw);
 149          $length = $initialLength;
 150          $pos = 0;
 151          while (is_array($parsed = $this->reduce($raw))) {
 152              list($unmatched, $matched, $mode) = $parsed;
 153              $currentLength = strlen($raw);
 154              $matchPos = $initialLength - $currentLength - strlen($matched);
 155              if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 156                  return false;
 157              }
 158              if ($currentLength == $length) {
 159                  return false;
 160              }
 161              $length = $currentLength;
 162              $pos = $initialLength - $currentLength;
 163          }
 164          if (!$parsed) {
 165              return false;
 166          }
 167          return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
 168      }
 169  
 170      /**
 171       * Sends the matched token and any leading unmatched
 172       * text to the parser changing the lexer to a new
 173       * mode if one is listed.
 174       *
 175       * @param string $unmatched Unmatched leading portion.
 176       * @param string $matched Actual token match.
 177       * @param bool|string $mode Mode after match. A boolean false mode causes no change.
 178       * @param int $initialPos
 179       * @param int $matchPos Current byte index location in raw doc thats being parsed
 180       * @return boolean             False if there was any error from the parser.
 181       */
 182      protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
 183      {
 184          if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
 185              return false;
 186          }
 187          if ($this->isModeEnd($mode)) {
 188              if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
 189                  return false;
 190              }
 191              return $this->modeStack->leave();
 192          }
 193          if ($this->isSpecialMode($mode)) {
 194              $this->modeStack->enter($this->decodeSpecial($mode));
 195              if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
 196                  return false;
 197              }
 198              return $this->modeStack->leave();
 199          }
 200          if (is_string($mode)) {
 201              $this->modeStack->enter($mode);
 202              return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
 203          }
 204          return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
 205      }
 206  
 207      /**
 208       * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
 209       * mode stack.
 210       *
 211       * @param string $mode    Mode to test.
 212       * @return boolean        True if this is the exit mode.
 213       */
 214      protected function isModeEnd($mode)
 215      {
 216          return ($mode === "__exit");
 217      }
 218  
 219      /**
 220       * Test to see if the mode is one where this mode is entered for this token only and automatically
 221       * leaves immediately afterwoods.
 222       *
 223       * @param string $mode    Mode to test.
 224       * @return boolean        True if this is the exit mode.
 225       */
 226      protected function isSpecialMode($mode)
 227      {
 228          return (strncmp($mode, "_", 1) == 0);
 229      }
 230  
 231      /**
 232       * Strips the magic underscore marking single token modes.
 233       *
 234       * @param string $mode    Mode to decode.
 235       * @return string         Underlying mode name.
 236       */
 237      protected function decodeSpecial($mode)
 238      {
 239          return substr($mode, 1);
 240      }
 241  
 242      /**
 243       * Calls the parser method named after the current mode.
 244       *
 245       * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
 246       *
 247       * @param string $content Text parsed.
 248       * @param boolean $is_match Token is recognised rather
 249       *                               than unparsed data.
 250       * @param int $pos Current byte index location in raw doc
 251       *                             thats being parsed
 252       * @return bool
 253       */
 254      protected function invokeHandler($content, $is_match, $pos)
 255      {
 256          if (($content === "") || ($content === false)) {
 257              return true;
 258          }
 259          $handler = $this->modeStack->getCurrent();
 260          if (isset($this->mode_handlers[$handler])) {
 261              $handler = $this->mode_handlers[$handler];
 262          }
 263  
 264          // modes starting with plugin_ are all handled by the same
 265          // handler but with an additional parameter
 266          if (substr($handler, 0, 7)=='plugin_') {
 267              list($handler,$plugin) = explode('_', $handler, 2);
 268              return $this->handler->$handler($content, $is_match, $pos, $plugin);
 269          }
 270  
 271          return $this->handler->$handler($content, $is_match, $pos);
 272      }
 273  
 274      /**
 275       * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
 276       * unparsed data. Empty strings will not be matched.
 277       *
 278       * @param string $raw         The subject to parse. This is the content that will be eaten.
 279       * @return array|bool         Three item list of unparsed content followed by the
 280       *                            recognised token and finally the action the parser is to take.
 281       *                            True if no match, false if there is a parsing error.
 282       */
 283      protected function reduce(&$raw)
 284      {
 285          if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
 286              return false;
 287          }
 288          if ($raw === "") {
 289              return true;
 290          }
 291          if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
 292              list($unparsed, $match, $raw) = $split;
 293              return array($unparsed, $match, $action);
 294          }
 295          return true;
 296      }
 297  
 298      /**
 299       * Escapes regex characters other than (, ) and /
 300       *
 301       * @param string $str
 302       * @return string
 303       */
 304      public static function escape($str)
 305      {
 306          $chars = array(
 307              '/\\\\/',
 308              '/\./',
 309              '/\+/',
 310              '/\*/',
 311              '/\?/',
 312              '/\[/',
 313              '/\^/',
 314              '/\]/',
 315              '/\$/',
 316              '/\{/',
 317              '/\}/',
 318              '/\=/',
 319              '/\!/',
 320              '/\</',
 321              '/\>/',
 322              '/\|/',
 323              '/\:/'
 324          );
 325  
 326          $escaped = array(
 327              '\\\\\\\\',
 328              '\.',
 329              '\+',
 330              '\*',
 331              '\?',
 332              '\[',
 333              '\^',
 334              '\]',
 335              '\$',
 336              '\{',
 337              '\}',
 338              '\=',
 339              '\!',
 340              '\<',
 341              '\>',
 342              '\|',
 343              '\:'
 344          );
 345          return preg_replace($chars, $escaped, $str);
 346      }
 347  }