You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

246 lines
6.0 KiB
PHP

<?php
namespace Highlight;
/**
* @internal
*
* @since 9.16.0.0
*/
final class Terminators
{
/** @var bool */
private $caseInsensitive;
/** @var array<int, Mode|string> */
private $matchIndexes = array();
/** @var RegEx|null */
private $matcherRe = null;
/** @var array<int, array<int, Mode|string>> */
private $regexes = array();
/** @var int */
private $matchAt = 1;
/** @var Mode */
private $mode;
/** @var int */
public $lastIndex = 0;
/**
* @param bool $caseInsensitive
*/
public function __construct($caseInsensitive)
{
$this->caseInsensitive = $caseInsensitive;
}
/**
* @internal
*
* @param Mode $mode
*
* @return self
*/
public function _buildModeRegex($mode)
{
$this->mode = $mode;
$term = null;
for ($i = 0; $i < count($mode->contains); ++$i) {
$re = null;
$term = $mode->contains[$i];
if ($term->beginKeywords) {
$re = "\.?(?:" . $term->begin . ")\.?";
} else {
$re = $term->begin;
}
$this->addRule($term, $re);
}
if ($mode->terminator_end) {
$this->addRule('end', $mode->terminator_end);
}
if ($mode->illegal) {
$this->addRule('illegal', $mode->illegal);
}
/** @var array<int, string> $terminators */
$terminators = array();
foreach ($this->regexes as $regex) {
$terminators[] = $regex[1];
}
$this->matcherRe = $this->langRe($this->joinRe($terminators, '|'), true);
$this->lastIndex = 0;
return $this;
}
/**
* @param string $s
*
* @return RegExMatch|null
*/
public function exec($s)
{
if (count($this->regexes) === 0) {
return null;
}
$this->matcherRe->lastIndex = $this->lastIndex;
$match = $this->matcherRe->exec($s);
if (!$match) {
return null;
}
/** @var Mode|string $rule */
$rule = null;
for ($i = 0; $i < count($match); ++$i) {
if ($match[$i] !== null && isset($this->matchIndexes[$i])) {
$rule = $this->matchIndexes[$i];
break;
}
}
if (is_string($rule)) {
$match->type = $rule;
$match->extra = array($this->mode->illegal, $this->mode->terminator_end);
} else {
$match->type = "begin";
$match->rule = $rule;
}
return $match;
}
/**
* @param string $value
* @param bool $global
*
* @return RegEx
*/
private function langRe($value, $global = false)
{
return RegExUtils::langRe($value, $global, $this->caseInsensitive);
}
/**
* @param Mode|string $rule
* @param string $regex
*
* @return void
*/
private function addRule($rule, $regex)
{
$this->matchIndexes[$this->matchAt] = $rule;
$this->regexes[] = array($rule, $regex);
$this->matchAt += $this->reCountMatchGroups($regex) + 1;
}
/**
* joinRe logically computes regexps.join(separator), but fixes the
* backreferences so they continue to match.
*
* it also places each individual regular expression into it's own
* match group, keeping track of the sequencing of those match groups
* is currently an exercise for the caller. :-)
*
* @param array<int, string> $regexps
* @param string $separator
*
* @return string
*/
private function joinRe($regexps, $separator)
{
// backreferenceRe matches an open parenthesis or backreference. To avoid
// an incorrect parse, it additionally matches the following:
// - [...] elements, where the meaning of parentheses and escapes change
// - other escape sequences, so we do not misparse escape sequences as
// interesting elements
// - non-matching or lookahead parentheses, which do not capture. These
// follow the '(' with a '?'.
$backreferenceRe = '#\[(?:[^\\\\\]]|\\\.)*\]|\(\??|\\\([1-9][0-9]*)|\\\.#';
$numCaptures = 0;
$ret = '';
$strLen = count($regexps);
for ($i = 0; $i < $strLen; ++$i) {
++$numCaptures;
$offset = $numCaptures;
$re = $this->reStr($regexps[$i]);
if ($i > 0) {
$ret .= $separator;
}
$ret .= "(";
while (strlen($re) > 0) {
$matches = array();
$matchFound = preg_match($backreferenceRe, $re, $matches, PREG_OFFSET_CAPTURE);
if ($matchFound === 0) {
$ret .= $re;
break;
}
// PHP aliases to match the JS naming conventions
$match = $matches[0];
$index = $match[1];
$ret .= substr($re, 0, $index);
$re = substr($re, $index + strlen($match[0]));
if (substr($match[0], 0, 1) === '\\' && isset($matches[1])) {
// Adjust the backreference.
$ret .= "\\" . strval(intval($matches[1][0]) + $offset);
} else {
$ret .= $match[0];
if ($match[0] == "(") {
++$numCaptures;
}
}
}
$ret .= ")";
}
return $ret;
}
/**
* @param RegEx|string $re
*
* @return mixed
*/
private function reStr($re)
{
if ($re && isset($re->source)) {
return $re->source;
}
return $re;
}
/**
* @param RegEx|string $re
*
* @return int
*/
private function reCountMatchGroups($re)
{
$results = array();
$escaped = preg_replace('#(?<!\\\)/#um', '\\/', (string) $re);
preg_match_all("/{$escaped}|/u", '', $results);
return count($results) - 1;
}
}