[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/parser/ -> lexer.php (source)

   1  <?php
   2  /**
   3   * Author Marcus Baker: http://www.lastcraft.com
   4   * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   5   * For an intro to the Lexer see:
   6   * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
   7   * @author Marcus Baker
   8   * @package Doku
   9   * @subpackage Lexer
  10   * @version $Id: lexer.php,v 1.1 2005/03/23 23:14:09 harryf Exp $
  11   */
  12  
/**
 * Refuse to run unless the DOKU_INC constant has already been defined;
 * otherwise the script terminates immediately.
 */
if(!defined('DOKU_INC')) die('meh.');
  17  
  18  /**#@+
  19   * lexer mode constant
  20   */
  21  define("DOKU_LEXER_ENTER", 1);
  22  define("DOKU_LEXER_MATCHED", 2);
  23  define("DOKU_LEXER_UNMATCHED", 3);
  24  define("DOKU_LEXER_EXIT", 4);
  25  define("DOKU_LEXER_SPECIAL", 5);
  26  /**#@-*/
  27  
  28  /**
  29   * Compounded regular expression. Any of
  30   * the contained patterns could match and
  31   * when one does it's label is returned.
  32   *
  33   * @package Doku
  34   * @subpackage Lexer
  35   */
  36  class Doku_LexerParallelRegex {
  37      var $_patterns;
  38      var $_labels;
  39      var $_regex;
  40      var $_case;
  41  
  42      /**
  43       * Constructor. Starts with no patterns.
  44       *
  45       * @param boolean $case    True for case sensitive, false
  46       *                         for insensitive.
  47       * @access public
  48       */
  49      function __construct($case) {
  50          $this->_case = $case;
  51          $this->_patterns = array();
  52          $this->_labels = array();
  53          $this->_regex = null;
  54      }
  55  
  56      /**
  57       * Adds a pattern with an optional label.
  58       *
  59       * @param mixed       $pattern Perl style regex. Must be UTF-8
  60       *                             encoded. If its a string, the (, )
  61       *                             lose their meaning unless they
  62       *                             form part of a lookahead or
  63       *                             lookbehind assertation.
  64       * @param bool|string $label   Label of regex to be returned
  65       *                             on a match. Label must be ASCII
  66       * @access public
  67       */
  68      function addPattern($pattern, $label = true) {
  69          $count = count($this->_patterns);
  70          $this->_patterns[$count] = $pattern;
  71          $this->_labels[$count] = $label;
  72          $this->_regex = null;
  73      }
  74  
  75      /**
  76       * Attempts to match all patterns at once against a string.
  77       *
  78       * @param string $subject      String to match against.
  79       * @param string $match        First matched portion of
  80       *                             subject.
  81       * @return boolean             True on success.
  82       * @access public
  83       */
  84      function match($subject, &$match) {
  85          if (count($this->_patterns) == 0) {
  86              return false;
  87          }
  88          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
  89              $match = "";
  90              return false;
  91          }
  92  
  93          $match = $matches[0];
  94          $size = count($matches);
  95          for ($i = 1; $i < $size; $i++) {
  96              if ($matches[$i] && isset($this->_labels[$i - 1])) {
  97                  return $this->_labels[$i - 1];
  98              }
  99          }
 100          return true;
 101      }
 102  
 103      /**
 104       * Attempts to split the string against all patterns at once
 105       *
 106       * @param string $subject      String to match against.
 107       * @param array $split         The split result: array containing, pre-match, match & post-match strings
 108       * @return boolean             True on success.
 109       * @access public
 110       *
 111       * @author Christopher Smith <chris@jalakai.co.uk>
 112       */
 113      function split($subject, &$split) {
 114          if (count($this->_patterns) == 0) {
 115              return false;
 116          }
 117  
 118          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
 119              if(function_exists('preg_last_error')){
 120                  $err = preg_last_error();
 121                  switch($err){
 122                      case PREG_BACKTRACK_LIMIT_ERROR:
 123                          msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',-1);
 124                          break;
 125                      case PREG_RECURSION_LIMIT_ERROR:
 126                          msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',-1);
 127                          break;
 128                      case PREG_BAD_UTF8_ERROR:
 129                          msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin',-1);
 130                          break;
 131                      case PREG_INTERNAL_ERROR:
 132                          msg('A PCRE internal error occured. This might be caused by a faulty plugin',-1);
 133                          break;
 134                  }
 135              }
 136  
 137              $split = array($subject, "", "");
 138              return false;
 139          }
 140  
 141          $idx = count($matches)-2;
 142          list($pre, $post) = preg_split($this->_patterns[$idx].$this->_getPerlMatchingFlags(), $subject, 2);
 143          $split = array($pre, $matches[0], $post);
 144  
 145          return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
 146      }
 147  
 148      /**
 149       * Compounds the patterns into a single
 150       * regular expression separated with the
 151       * "or" operator. Caches the regex.
 152       * Will automatically escape (, ) and / tokens.
 153       *
 154       * @internal array $_patterns List of patterns in order.
 155       * @return null|string
 156       * @access private
 157       */
 158      function _getCompoundedRegex() {
 159          if ($this->_regex == null) {
 160              $cnt = count($this->_patterns);
 161              for ($i = 0; $i < $cnt; $i++) {
 162  
 163                  /*
 164                   * decompose the input pattern into "(", "(?", ")",
 165                   * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
 166                   * elements.
 167                   */
 168                  preg_match_all('/\\\\.|' .
 169                                 '\(\?|' .
 170                                 '[()]|' .
 171                                 '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
 172                                 '[^[()\\\\]+/', $this->_patterns[$i], $elts);
 173  
 174                  $pattern = "";
 175                  $level = 0;
 176  
 177                  foreach ($elts[0] as $elt) {
 178                      /*
 179                       * for "(", ")" remember the nesting level, add "\"
 180                       * only to the non-"(?" ones.
 181                       */
 182  
 183                      switch($elt) {
 184                          case '(':
 185                              $pattern .= '\(';
 186                              break;
 187                          case ')':
 188                              if ($level > 0)
 189                                  $level--; /* closing (? */
 190                              else
 191                                  $pattern .= '\\';
 192                              $pattern .= ')';
 193                              break;
 194                          case '(?':
 195                              $level++;
 196                              $pattern .= '(?';
 197                              break;
 198                          default:
 199                              if (substr($elt, 0, 1) == '\\')
 200                                  $pattern .= $elt;
 201                              else
 202                                  $pattern .= str_replace('/', '\/', $elt);
 203                      }
 204                  }
 205                  $this->_patterns[$i] = "($pattern)";
 206              }
 207              $this->_regex = "/" . implode("|", $this->_patterns) . "/" . $this->_getPerlMatchingFlags();
 208          }
 209          return $this->_regex;
 210      }
 211  
 212      /**
 213       * Accessor for perl regex mode flags to use.
 214       * @return string       Perl regex flags.
 215       * @access private
 216       */
 217      function _getPerlMatchingFlags() {
 218          return ($this->_case ? "msS" : "msSi");
 219      }
 220  }
 221  
 222  /**
 223   * States for a stack machine.
 224   * @package Lexer
 225   * @subpackage Lexer
 226   */
 227  class Doku_LexerStateStack {
 228      var $_stack;
 229  
 230      /**
 231       * Constructor. Starts in named state.
 232       * @param string $start        Starting state name.
 233       * @access public
 234       */
 235      function __construct($start) {
 236          $this->_stack = array($start);
 237      }
 238  
 239      /**
 240       * Accessor for current state.
 241       * @return string       State.
 242       * @access public
 243       */
 244      function getCurrent() {
 245          return $this->_stack[count($this->_stack) - 1];
 246      }
 247  
 248      /**
 249       * Adds a state to the stack and sets it
 250       * to be the current state.
 251       * @param string $state        New state.
 252       * @access public
 253       */
 254      function enter($state) {
 255          array_push($this->_stack, $state);
 256      }
 257  
 258      /**
 259       * Leaves the current state and reverts
 260       * to the previous one.
 261       * @return boolean    False if we drop off
 262       *                    the bottom of the list.
 263       * @access public
 264       */
 265      function leave() {
 266          if (count($this->_stack) == 1) {
 267              return false;
 268          }
 269          array_pop($this->_stack);
 270          return true;
 271      }
 272  }
 273  
 274  /**
 275   * Accepts text and breaks it into tokens.
 276   * Some optimisation to make the sure the
 277   * content is only scanned by the PHP regex
 278   * parser once. Lexer modes must not start
 279   * with leading underscores.
 280   * @package Doku
 281   * @subpackage Lexer
 282   */
 283  class Doku_Lexer {
 284      var $_regexes;
 285      var $_parser;
 286      var $_mode;
 287      var $_mode_handlers;
 288      var $_case;
 289  
 290      /**
 291       * Sets up the lexer in case insensitive matching
 292       * by default.
 293       * @param Doku_Parser $parser  Handling strategy by
 294       *                                 reference.
 295       * @param string $start            Starting handler.
 296       * @param boolean $case            True for case sensitive.
 297       * @access public
 298       */
 299      function __construct($parser, $start = "accept", $case = false) {
 300          $this->_case = $case;
 301          /** @var Doku_LexerParallelRegex[] _regexes */
 302          $this->_regexes = array();
 303          $this->_parser = $parser;
 304          $this->_mode = new Doku_LexerStateStack($start);
 305          $this->_mode_handlers = array();
 306      }
 307  
 308      /**
 309       * Adds a token search pattern for a particular
 310       * parsing mode. The pattern does not change the
 311       * current mode.
 312       * @param string $pattern      Perl style regex, but ( and )
 313       *                             lose the usual meaning.
 314       * @param string $mode         Should only apply this
 315       *                             pattern when dealing with
 316       *                             this type of input.
 317       * @access public
 318       */
 319      function addPattern($pattern, $mode = "accept") {
 320          if (! isset($this->_regexes[$mode])) {
 321              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 322          }
 323          $this->_regexes[$mode]->addPattern($pattern);
 324      }
 325  
 326      /**
 327       * Adds a pattern that will enter a new parsing
 328       * mode. Useful for entering parenthesis, strings,
 329       * tags, etc.
 330       * @param string $pattern      Perl style regex, but ( and )
 331       *                             lose the usual meaning.
 332       * @param string $mode         Should only apply this
 333       *                             pattern when dealing with
 334       *                             this type of input.
 335       * @param string $new_mode     Change parsing to this new
 336       *                             nested mode.
 337       * @access public
 338       */
 339      function addEntryPattern($pattern, $mode, $new_mode) {
 340          if (! isset($this->_regexes[$mode])) {
 341              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 342          }
 343          $this->_regexes[$mode]->addPattern($pattern, $new_mode);
 344      }
 345  
 346      /**
 347       * Adds a pattern that will exit the current mode
 348       * and re-enter the previous one.
 349       * @param string $pattern      Perl style regex, but ( and )
 350       *                             lose the usual meaning.
 351       * @param string $mode         Mode to leave.
 352       * @access public
 353       */
 354      function addExitPattern($pattern, $mode) {
 355          if (! isset($this->_regexes[$mode])) {
 356              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 357          }
 358          $this->_regexes[$mode]->addPattern($pattern, "__exit");
 359      }
 360  
 361      /**
 362       * Adds a pattern that has a special mode. Acts as an entry
 363       * and exit pattern in one go, effectively calling a special
 364       * parser handler for this token only.
 365       * @param string $pattern      Perl style regex, but ( and )
 366       *                             lose the usual meaning.
 367       * @param string $mode         Should only apply this
 368       *                             pattern when dealing with
 369       *                             this type of input.
 370       * @param string $special      Use this mode for this one token.
 371       * @access public
 372       */
 373      function addSpecialPattern($pattern, $mode, $special) {
 374          if (! isset($this->_regexes[$mode])) {
 375              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
 376          }
 377          $this->_regexes[$mode]->addPattern($pattern, "_$special");
 378      }
 379  
 380      /**
 381       * Adds a mapping from a mode to another handler.
 382       * @param string $mode        Mode to be remapped.
 383       * @param string $handler     New target handler.
 384       * @access public
 385       */
 386      function mapHandler($mode, $handler) {
 387          $this->_mode_handlers[$mode] = $handler;
 388      }
 389  
 390      /**
 391       * Splits the page text into tokens. Will fail
 392       * if the handlers report an error or if no
 393       * content is consumed. If successful then each
 394       * unparsed and parsed token invokes a call to the
 395       * held listener.
 396       * @param string $raw        Raw HTML text.
 397       * @return boolean           True on success, else false.
 398       * @access public
 399       */
 400      function parse($raw) {
 401          if (! isset($this->_parser)) {
 402              return false;
 403          }
 404          $initialLength = strlen($raw);
 405          $length = $initialLength;
 406          $pos = 0;
 407          while (is_array($parsed = $this->_reduce($raw))) {
 408              list($unmatched, $matched, $mode) = $parsed;
 409              $currentLength = strlen($raw);
 410              $matchPos = $initialLength - $currentLength - strlen($matched);
 411              if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 412                  return false;
 413              }
 414              if ($currentLength == $length) {
 415                  return false;
 416              }
 417              $length = $currentLength;
 418              $pos = $initialLength - $currentLength;
 419          }
 420          if (!$parsed) {
 421              return false;
 422          }
 423          return $this->_invokeParser($raw, DOKU_LEXER_UNMATCHED, $pos);
 424      }
 425  
 426      /**
 427       * Sends the matched token and any leading unmatched
 428       * text to the parser changing the lexer to a new
 429       * mode if one is listed.
 430       * @param string $unmatched Unmatched leading portion.
 431       * @param string $matched Actual token match.
 432       * @param bool|string $mode Mode after match. A boolean
 433       *                             false mode causes no change.
 434       * @param int $initialPos
 435       * @param int $matchPos
 436       *                             Current byte index location in raw doc
 437       *                             thats being parsed
 438       * @return boolean             False if there was any error
 439       *                             from the parser.
 440       * @access private
 441       */
 442      function _dispatchTokens($unmatched, $matched, $mode = false, $initialPos, $matchPos) {
 443          if (! $this->_invokeParser($unmatched, DOKU_LEXER_UNMATCHED, $initialPos) ){
 444              return false;
 445          }
 446          if ($this->_isModeEnd($mode)) {
 447              if (! $this->_invokeParser($matched, DOKU_LEXER_EXIT, $matchPos)) {
 448                  return false;
 449              }
 450              return $this->_mode->leave();
 451          }
 452          if ($this->_isSpecialMode($mode)) {
 453              $this->_mode->enter($this->_decodeSpecial($mode));
 454              if (! $this->_invokeParser($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
 455                  return false;
 456              }
 457              return $this->_mode->leave();
 458          }
 459          if (is_string($mode)) {
 460              $this->_mode->enter($mode);
 461              return $this->_invokeParser($matched, DOKU_LEXER_ENTER, $matchPos);
 462          }
 463          return $this->_invokeParser($matched, DOKU_LEXER_MATCHED, $matchPos);
 464      }
 465  
 466      /**
 467       * Tests to see if the new mode is actually to leave
 468       * the current mode and pop an item from the matching
 469       * mode stack.
 470       * @param string $mode    Mode to test.
 471       * @return boolean        True if this is the exit mode.
 472       * @access private
 473       */
 474      function _isModeEnd($mode) {
 475          return ($mode === "__exit");
 476      }
 477  
 478      /**
 479       * Test to see if the mode is one where this mode
 480       * is entered for this token only and automatically
 481       * leaves immediately afterwoods.
 482       * @param string $mode    Mode to test.
 483       * @return boolean        True if this is the exit mode.
 484       * @access private
 485       */
 486      function _isSpecialMode($mode) {
 487          return (strncmp($mode, "_", 1) == 0);
 488      }
 489  
 490      /**
 491       * Strips the magic underscore marking single token
 492       * modes.
 493       * @param string $mode    Mode to decode.
 494       * @return string         Underlying mode name.
 495       * @access private
 496       */
 497      function _decodeSpecial($mode) {
 498          return substr($mode, 1);
 499      }
 500  
 501      /**
 502       * Calls the parser method named after the current
 503       * mode. Empty content will be ignored. The lexer
 504       * has a parser handler for each mode in the lexer.
 505       * @param string $content Text parsed.
 506       * @param boolean $is_match Token is recognised rather
 507       *                               than unparsed data.
 508       * @param int $pos Current byte index location in raw doc
 509       *                             thats being parsed
 510       * @return bool
 511       * @access private
 512       */
 513      function _invokeParser($content, $is_match, $pos) {
 514          if (($content === "") || ($content === false)) {
 515              return true;
 516          }
 517          $handler = $this->_mode->getCurrent();
 518          if (isset($this->_mode_handlers[$handler])) {
 519              $handler = $this->_mode_handlers[$handler];
 520          }
 521  
 522          // modes starting with plugin_ are all handled by the same
 523          // handler but with an additional parameter
 524          if(substr($handler,0,7)=='plugin_'){
 525              list($handler,$plugin) = explode('_',$handler,2);
 526              return $this->_parser->$handler($content, $is_match, $pos, $plugin);
 527          }
 528  
 529              return $this->_parser->$handler($content, $is_match, $pos);
 530          }
 531  
 532      /**
 533       * Tries to match a chunk of text and if successful
 534       * removes the recognised chunk and any leading
 535       * unparsed data. Empty strings will not be matched.
 536       * @param string $raw         The subject to parse. This is the
 537       *                            content that will be eaten.
 538       * @return array              Three item list of unparsed
 539       *                            content followed by the
 540       *                            recognised token and finally the
 541       *                            action the parser is to take.
 542       *                            True if no match, false if there
 543       *                            is a parsing error.
 544       * @access private
 545       */
 546      function _reduce(&$raw) {
 547          if (! isset($this->_regexes[$this->_mode->getCurrent()])) {
 548              return false;
 549          }
 550          if ($raw === "") {
 551              return true;
 552          }
 553          if ($action = $this->_regexes[$this->_mode->getCurrent()]->split($raw, $split)) {
 554              list($unparsed, $match, $raw) = $split;
 555              return array($unparsed, $match, $action);
 556          }
 557          return true;
 558      }
 559  }
 560  
 561  /**
 562   * Escapes regex characters other than (, ) and /
 563   *
 564   * @TODO
 565   *
 566   * @param string $str
 567   *
 568   * @return mixed
 569   */
 570  function Doku_Lexer_Escape($str) {
 571      //$str = addslashes($str);
 572      $chars = array(
 573          '/\\\\/',
 574          '/\./',
 575          '/\+/',
 576          '/\*/',
 577          '/\?/',
 578          '/\[/',
 579          '/\^/',
 580          '/\]/',
 581          '/\$/',
 582          '/\{/',
 583          '/\}/',
 584          '/\=/',
 585          '/\!/',
 586          '/\</',
 587          '/\>/',
 588          '/\|/',
 589          '/\:/'
 590          );
 591  
 592      $escaped = array(
 593          '\\\\\\\\',
 594          '\.',
 595          '\+',
 596          '\*',
 597          '\?',
 598          '\[',
 599          '\^',
 600          '\]',
 601          '\$',
 602          '\{',
 603          '\}',
 604          '\=',
 605          '\!',
 606          '\<',
 607          '\>',
 608          '\|',
 609          '\:'
 610          );
 611      return preg_replace($chars, $escaped, $str);
 612  }
 613  
 614  //Setup VIM: ex: et ts=4 sw=4 :