[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/Parsing/Lexer/ -> Lexer.php (source)

   1  <?php
   2  
   3  /**
   4   * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   5   * For an intro to the Lexer see:
   6   * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
   7   *
   8   * @author Marcus Baker http://www.lastcraft.com
   9   */
  10  
  11  namespace dokuwiki\Parsing\Lexer;
  12  
  13  /**
  14   * Accepts text and breaks it into tokens.
  15   *
  16   * Some optimisation to make the sure the content is only scanned by the PHP regex
  17   * parser once. Lexer modes must not start with leading underscores.
  18   */
  19  class Lexer
  20  {
  21      /** @var ParallelRegex[] */
  22      protected $regexes = [];
  23      /** @var \Doku_Handler */
  24      protected $handler;
  25      /** @var StateStack */
  26      protected $modeStack;
  27      /** @var array mode "rewrites" */
  28      protected $mode_handlers = [];
  29      /** @var bool case sensitive? */
  30      protected $case;
  31  
  32      /**
  33       * Sets up the lexer in case insensitive matching by default.
  34       *
  35       * @param \Doku_Handler $handler  Handling strategy by reference.
  36       * @param string $start            Starting handler.
  37       * @param boolean $case            True for case sensitive.
  38       */
  39      public function __construct($handler, $start = "accept", $case = false)
  40      {
  41          $this->case = $case;
  42          $this->handler = $handler;
  43          $this->modeStack = new StateStack($start);
  44      }
  45  
  46      /**
  47       * Adds a token search pattern for a particular parsing mode.
  48       *
  49       * The pattern does not change the current mode.
  50       *
  51       * @param string $pattern      Perl style regex, but ( and )
  52       *                             lose the usual meaning.
  53       * @param string $mode         Should only apply this
  54       *                             pattern when dealing with
  55       *                             this type of input.
  56       */
  57      public function addPattern($pattern, $mode = "accept")
  58      {
  59          if (! isset($this->regexes[$mode])) {
  60              $this->regexes[$mode] = new ParallelRegex($this->case);
  61          }
  62          $this->regexes[$mode]->addPattern($pattern);
  63      }
  64  
  65      /**
  66       * Adds a pattern that will enter a new parsing mode.
  67       *
  68       * Useful for entering parenthesis, strings, tags, etc.
  69       *
  70       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
  71       * @param string $mode         Should only apply this pattern when dealing with this type of input.
  72       * @param string $new_mode     Change parsing to this new nested mode.
  73       */
  74      public function addEntryPattern($pattern, $mode, $new_mode)
  75      {
  76          if (! isset($this->regexes[$mode])) {
  77              $this->regexes[$mode] = new ParallelRegex($this->case);
  78          }
  79          $this->regexes[$mode]->addPattern($pattern, $new_mode);
  80      }
  81  
  82      /**
  83       * Adds a pattern that will exit the current mode and re-enter the previous one.
  84       *
  85       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
  86       * @param string $mode         Mode to leave.
  87       */
  88      public function addExitPattern($pattern, $mode)
  89      {
  90          if (! isset($this->regexes[$mode])) {
  91              $this->regexes[$mode] = new ParallelRegex($this->case);
  92          }
  93          $this->regexes[$mode]->addPattern($pattern, "__exit");
  94      }
  95  
  96      /**
  97       * Adds a pattern that has a special mode.
  98       *
  99       * Acts as an entry and exit pattern in one go, effectively calling a special
 100       * parser handler for this token only.
 101       *
 102       * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
 103       * @param string $mode         Should only apply this pattern when dealing with this type of input.
 104       * @param string $special      Use this mode for this one token.
 105       */
 106      public function addSpecialPattern($pattern, $mode, $special)
 107      {
 108          if (! isset($this->regexes[$mode])) {
 109              $this->regexes[$mode] = new ParallelRegex($this->case);
 110          }
 111          $this->regexes[$mode]->addPattern($pattern, "_$special");
 112      }
 113  
 114      /**
 115       * Adds a mapping from a mode to another handler.
 116       *
 117       * @param string $mode        Mode to be remapped.
 118       * @param string $handler     New target handler.
 119       */
 120      public function mapHandler($mode, $handler)
 121      {
 122          $this->mode_handlers[$mode] = $handler;
 123      }
 124  
 125      /**
 126       * Splits the page text into tokens.
 127       *
 128       * Will fail if the handlers report an error or if no content is consumed. If successful then each
 129       * unparsed and parsed token invokes a call to the held listener.
 130       *
 131       * @param string $raw        Raw HTML text.
 132       * @return boolean           True on success, else false.
 133       */
 134      public function parse($raw)
 135      {
 136          if (! isset($this->handler)) {
 137              return false;
 138          }
 139          $initialLength = strlen($raw);
 140          $length = $initialLength;
 141          $pos = 0;
 142          while (is_array($parsed = $this->reduce($raw))) {
 143              [$unmatched, $matched, $mode] = $parsed;
 144              $currentLength = strlen($raw);
 145              $matchPos = $initialLength - $currentLength - strlen($matched);
 146              if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 147                  return false;
 148              }
 149              if ($currentLength === $length) {
 150                  return false;
 151              }
 152              $length = $currentLength;
 153              $pos = $initialLength - $currentLength;
 154          }
 155          if (!$parsed) {
 156              return false;
 157          }
 158          return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
 159      }
 160  
 161      /**
 162       * Gives plugins access to the mode stack
 163       *
 164       * @return StateStack
 165       */
 166      public function getModeStack()
 167      {
 168          return $this->modeStack;
 169      }
 170  
 171      /**
 172       * Sends the matched token and any leading unmatched
 173       * text to the parser changing the lexer to a new
 174       * mode if one is listed.
 175       *
 176       * @param string $unmatched Unmatched leading portion.
 177       * @param string $matched Actual token match.
 178       * @param bool|string $mode Mode after match. A boolean false mode causes no change.
 179       * @param int $initialPos
 180       * @param int $matchPos Current byte index location in raw doc thats being parsed
 181       * @return boolean             False if there was any error from the parser.
 182       */
 183      protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
 184      {
 185          if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
 186              return false;
 187          }
 188          if ($this->isModeEnd($mode)) {
 189              if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
 190                  return false;
 191              }
 192              return $this->modeStack->leave();
 193          }
 194          if ($this->isSpecialMode($mode)) {
 195              $this->modeStack->enter($this->decodeSpecial($mode));
 196              if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
 197                  return false;
 198              }
 199              return $this->modeStack->leave();
 200          }
 201          if (is_string($mode)) {
 202              $this->modeStack->enter($mode);
 203              return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
 204          }
 205          return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
 206      }
 207  
 208      /**
 209       * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
 210       * mode stack.
 211       *
 212       * @param string $mode    Mode to test.
 213       * @return boolean        True if this is the exit mode.
 214       */
 215      protected function isModeEnd($mode)
 216      {
 217          return ($mode === "__exit");
 218      }
 219  
 220      /**
 221       * Test to see if the mode is one where this mode is entered for this token only and automatically
 222       * leaves immediately afterwoods.
 223       *
 224       * @param string $mode    Mode to test.
 225       * @return boolean        True if this is the exit mode.
 226       */
 227      protected function isSpecialMode($mode)
 228      {
 229          return str_starts_with($mode, '_');
 230      }
 231  
 232      /**
 233       * Strips the magic underscore marking single token modes.
 234       *
 235       * @param string $mode    Mode to decode.
 236       * @return string         Underlying mode name.
 237       */
 238      protected function decodeSpecial($mode)
 239      {
 240          return substr($mode, 1);
 241      }
 242  
 243      /**
 244       * Calls the parser method named after the current mode.
 245       *
 246       * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
 247       *
 248       * @param string $content Text parsed.
 249       * @param boolean $is_match Token is recognised rather
 250       *                               than unparsed data.
 251       * @param int $pos Current byte index location in raw doc
 252       *                             thats being parsed
 253       * @return bool
 254       */
 255      protected function invokeHandler($content, $is_match, $pos)
 256      {
 257          if (($content === "") || ($content === false)) {
 258              return true;
 259          }
 260          $handler = $this->modeStack->getCurrent();
 261          if (isset($this->mode_handlers[$handler])) {
 262              $handler = $this->mode_handlers[$handler];
 263          }
 264  
 265          // modes starting with plugin_ are all handled by the same
 266          // handler but with an additional parameter
 267          if (str_starts_with($handler, 'plugin_')) {
 268              [$handler, $plugin] = sexplode('_', $handler, 2, '');
 269              return $this->handler->$handler($content, $is_match, $pos, $plugin);
 270          }
 271  
 272          return $this->handler->$handler($content, $is_match, $pos);
 273      }
 274  
 275      /**
 276       * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
 277       * unparsed data. Empty strings will not be matched.
 278       *
 279       * @param string $raw         The subject to parse. This is the content that will be eaten.
 280       * @return array|bool         Three item list of unparsed content followed by the
 281       *                            recognised token and finally the action the parser is to take.
 282       *                            True if no match, false if there is a parsing error.
 283       */
 284      protected function reduce(&$raw)
 285      {
 286          if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
 287              return false;
 288          }
 289          if ($raw === "") {
 290              return true;
 291          }
 292          if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
 293              [$unparsed, $match, $raw] = $split;
 294              return [$unparsed, $match, $action];
 295          }
 296          return true;
 297      }
 298  
 299      /**
 300       * Escapes regex characters other than (, ) and /
 301       *
 302       * @param string $str
 303       * @return string
 304       */
 305      public static function escape($str)
 306      {
 307          $chars = [
 308              '/\\\\/',
 309              '/\./',
 310              '/\+/',
 311              '/\*/',
 312              '/\?/',
 313              '/\[/',
 314              '/\^/',
 315              '/\]/',
 316              '/\$/',
 317              '/\{/',
 318              '/\}/',
 319              '/\=/',
 320              '/\!/',
 321              '/\</',
 322              '/\>/',
 323              '/\|/',
 324              '/\:/'
 325          ];
 326  
 327          $escaped = [
 328              '\\\\\\\\',
 329              '\.',
 330              '\+',
 331              '\*',
 332              '\?',
 333              '\[',
 334              '\^',
 335              '\]',
 336              '\$',
 337              '\{',
 338              '\}',
 339              '\=',
 340              '\!',
 341              '\<',
 342              '\>',
 343              '\|',
 344              '\:'
 345          ];
 346  
 347          return preg_replace($chars, $escaped, $str);
 348      }
 349  }