Skip to content

Commit

Permalink
Use delimiter position to optimize processing
Browse files Browse the repository at this point in the history
Delimiters may be deleted, so we store delimiter positions instead of
pointers. This also allows us to optimize searches within the stack,
avoiding quadratic behavior when parsing emphasis.

See github/cmark-gfm@75008f1
  • Loading branch information
colinodell committed Dec 7, 2024
1 parent e7584cf commit b61bbd4
Show file tree
Hide file tree
Showing 12 changed files with 192 additions and 51 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi
- `[` and `]` are no longer added as `Delimiter` objects on the stack; a new `Bracket` type with its own stack is used instead
- `UrlAutolinkParser` no longer parses URLs with more than 127 subdomains
- Expanded reference links can no longer exceed 100kb, or the size of the input document (whichever is greater)
- Delimiters should always provide a non-null value via `DelimiterInterface::getIndex()`
- We'll attempt to infer the index based on surrounding delimiters where possible
- The `DelimiterStack` now accepts integer positions for any `$stackBottom` argument
- Several small performance optimizations

## [2.5.3] - 2024-08-16
Expand Down Expand Up @@ -95,14 +98,16 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi

- Returning dynamic values from `DelimiterProcessorInterface::getDelimiterUse()` is deprecated
- You should instead implement `CacheableDelimiterProcessorInterface` to help the engine perform caching to avoid performance issues.
- Failing to set a delimiter's index (or returning `null` from `DelimiterInterface::getIndex()`) is deprecated and will not be supported in 3.0
- Deprecated `DelimiterInterface::isActive()` and `DelimiterInterface::setActive()`, as these are no longer used by the engine
- Deprecated `DelimiterStack::removeEarlierMatches()` and `DelimiterStack::searchByCharacter()`, as these are no longer used by the engine
- Passing a `DelimiterInterface` as the `$stackBottom` argument to `DelimiterStack::processDelimiters()` or `::removeAll()` is deprecated and will not be supported in 3.0; pass the integer position instead.

### Fixed

- Fixed NUL characters not being replaced in the input
- Fixed quadratic complexity parsing unclosed inline links
- Fixed quadratic complexity finding the bottom opener for emphasis and strikethrough delimiters
- Fixed quadratic complexity parsing emphasis and strikethrough delimiters
- Fixed issue where having 500,000+ delimiters could trigger a [known segmentation fault issue in PHP's garbage collection](https://bugs.php.net/bug.php?id=68606)
- Fixed quadratic complexity deactivating link openers
- Fixed catastrophic backtracking when parsing link labels/titles
Expand Down
2 changes: 2 additions & 0 deletions phpstan.neon.dist
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ parameters:
message: '#Parameter .+ of class .+Reference constructor expects string, string\|null given#'
- path: src/Util/RegexHelper.php
message: '#Method .+RegexHelper::unescape\(\) should return string but returns string\|null#'
- path: src/Delimiter/DelimiterStack.php
message: '#unknown class WeakMap#'
exceptions:
uncheckedExceptionClasses:
# Exceptions caused by bad developer logic that should always bubble up:
Expand Down
23 changes: 8 additions & 15 deletions src/Delimiter/Bracket.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,17 @@ final class Bracket
{
private Node $node;
private ?Bracket $previous;
private ?DelimiterInterface $previousDelimiter;
private bool $hasNext = false;
private int $index;
private int $position;
private bool $image;
private bool $active = true;

public function __construct(Node $node, ?Bracket $previous, ?DelimiterInterface $previousDelimiter, int $index, bool $image)
public function __construct(Node $node, ?Bracket $previous, int $position, bool $image)
{
$this->node = $node;
$this->previous = $previous;
$this->previousDelimiter = $previousDelimiter;
$this->index = $index;
$this->image = $image;
$this->node = $node;
$this->previous = $previous;
$this->position = $position;
$this->image = $image;
}

public function getNode(): Node
Expand All @@ -44,19 +42,14 @@ public function getPrevious(): ?Bracket
return $this->previous;
}

public function getPreviousDelimiter(): ?DelimiterInterface
{
return $this->previousDelimiter;
}

public function hasNext(): bool
{
return $this->hasNext;
}

public function getIndex(): int
public function getPosition(): int
{
return $this->index;
return $this->position;
}

public function isImage(): bool
Expand Down
129 changes: 114 additions & 15 deletions src/Delimiter/DelimiterStack.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

use League\CommonMark\Delimiter\Processor\CacheableDelimiterProcessorInterface;
use League\CommonMark\Delimiter\Processor\DelimiterProcessorCollection;
use League\CommonMark\Exception\LogicException;
use League\CommonMark\Node\Inline\AdjacentTextMerger;
use League\CommonMark\Node\Node;

Expand All @@ -32,6 +33,23 @@ final class DelimiterStack
/** @psalm-readonly-allow-private-mutation */
private ?Bracket $brackets = null;

/**
* @deprecated This property will be removed in 3.0 once all delimiters MUST have an index/position
*
* @var \SplObjectStorage<DelimiterInterface, int>|\WeakMap<DelimiterInterface, int>
*/
private $missingIndexCache;

public function __construct()
{
if (\PHP_VERSION_ID >= 80000) {
/** @psalm-suppress PropertyTypeCoercion */
$this->missingIndexCache = new \WeakMap(); // @phpstan-ignore-line
} else {
$this->missingIndexCache = new \SplObjectStorage(); // @phpstan-ignore-line
}
}

public function push(DelimiterInterface $newDelimiter): void
{
$newDelimiter->setPrevious($this->top);
Expand All @@ -52,7 +70,7 @@ public function addBracket(Node $node, int $index, bool $image): void
$this->brackets->setHasNext(true);
}

$this->brackets = new Bracket($node, $this->brackets, $this->top, $index, $image);
$this->brackets = new Bracket($node, $this->brackets, $index, $image);
}

/**
Expand All @@ -63,14 +81,21 @@ public function getLastBracket(): ?Bracket
return $this->brackets;
}

private function findEarliest(?DelimiterInterface $stackBottom = null): ?DelimiterInterface
/**
* @throws LogicException
*/
private function findEarliest(int $stackBottom): ?DelimiterInterface
{
$delimiter = $this->top;
while ($delimiter !== null && $delimiter->getPrevious() !== $stackBottom) {
$delimiter = $delimiter->getPrevious();
// Move back to first relevant delim.
$delimiter = $this->top;
$lastChecked = null;

while ($delimiter !== null && self::getIndex($delimiter) > $stackBottom) {
$lastChecked = $delimiter;
$delimiter = $delimiter->getPrevious();
}

return $delimiter;
return $lastChecked;
}

/**
Expand Down Expand Up @@ -113,6 +138,9 @@ public function removeDelimiter(DelimiterInterface $delimiter): void
// segfaults like in https://bugs.php.net/bug.php?id=68606.
$delimiter->setPrevious(null);
$delimiter->setNext(null);

// TODO: Remove the line below once PHP 7.4 support is dropped, as WeakMap won't hold onto the reference, making this unnecessary
unset($this->missingIndexCache[$delimiter]);
}

private function removeDelimiterAndNode(DelimiterInterface $delimiter): void
Expand All @@ -121,19 +149,30 @@ private function removeDelimiterAndNode(DelimiterInterface $delimiter): void
$this->removeDelimiter($delimiter);
}

/**
* @throws LogicException
*/
private function removeDelimitersBetween(DelimiterInterface $opener, DelimiterInterface $closer): void
{
$delimiter = $closer->getPrevious();
while ($delimiter !== null && $delimiter !== $opener) {
$delimiter = $closer->getPrevious();
$openerPosition = self::getIndex($opener);
while ($delimiter !== null && self::getIndex($delimiter) > $openerPosition) {
$previous = $delimiter->getPrevious();
$this->removeDelimiter($delimiter);
$delimiter = $previous;
}
}

public function removeAll(?DelimiterInterface $stackBottom = null): void
/**
* @param DelimiterInterface|int|null $stackBottom
*
* @throws LogicException if the index/position cannot be determined for some delimiter
*/
public function removeAll($stackBottom = null): void
{
while ($this->top && $this->top !== $stackBottom) {
$stackBottomPosition = \is_int($stackBottom) ? $stackBottom : self::getIndex($stackBottom);

while ($this->top && $this->getIndex($this->top) > $stackBottomPosition) {
$this->removeDelimiter($this->top);
}
}
Expand Down Expand Up @@ -188,12 +227,22 @@ public function searchByCharacter($characters): ?DelimiterInterface
return $opener;
}

public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterProcessorCollection $processors): void
/**
* @param DelimiterInterface|int|null $stackBottom
*
* @throws LogicException if the index/position cannot be determined for any delimiter
*
* @todo change $stackBottom to an int in 3.0
*/
public function processDelimiters($stackBottom, DelimiterProcessorCollection $processors): void
{
/** @var array<string, int> $openersBottom */
$openersBottom = [];

$stackBottomPosition = \is_int($stackBottom) ? $stackBottom : self::getIndex($stackBottom);

// Find first closer above stackBottom
$closer = $this->findEarliest($stackBottom);
$closer = $this->findEarliest($stackBottomPosition);

// Move forward, looking for closers, and handling each
while ($closer !== null) {
Expand All @@ -217,7 +266,7 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro
$openerFound = false;
$potentialOpenerFound = false;
$opener = $closer->getPrevious();
while ($opener !== null && $opener !== $stackBottom && $opener !== ($openersBottom[$openersBottomCacheKey] ?? null)) {
while ($opener !== null && ($openerPosition = self::getIndex($opener)) > $stackBottomPosition && $openerPosition >= ($openersBottom[$openersBottomCacheKey] ?? 0)) {
if ($opener->canOpen() && $opener->getChar() === $openingDelimiterChar) {
$potentialOpenerFound = true;
$useDelims = $delimiterProcessor->getDelimiterUse($opener, $closer);
Expand All @@ -234,7 +283,7 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro
// Set lower bound for future searches
// TODO: Remove this conditional check in 3.0. It only exists to prevent behavioral BC breaks in 2.x.
if ($potentialOpenerFound === false || $delimiterProcessor instanceof CacheableDelimiterProcessorInterface) {
$openersBottom[$openersBottomCacheKey] = $closer->getPrevious();
$openersBottom[$openersBottomCacheKey] = self::getIndex($closer);
}

if (! $potentialOpenerFound && ! $closer->canOpen()) {
Expand Down Expand Up @@ -282,7 +331,7 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro
}

// Remove all delimiters
$this->removeAll($stackBottom);
$this->removeAll($stackBottomPosition);
}

/**
Expand All @@ -298,4 +347,54 @@ public function __destruct()
$this->removeBracket();
}
}

/**
* @deprecated This method will be dropped in 3.0 once all delimiters MUST have an index/position
*
* @throws LogicException if no index was defined on this delimiter, and no reasonable guess could be made based on its neighbors
*/
private function getIndex(?DelimiterInterface $delimiter): int
{
if ($delimiter === null) {
return -1;
}

if (($index = $delimiter->getIndex()) !== null) {
return $index;
}

if (isset($this->missingIndexCache[$delimiter])) {
return $this->missingIndexCache[$delimiter];
}

$prev = $delimiter->getPrevious();
$next = $delimiter->getNext();

$i = 0;
do {
$i++;
if ($prev === null) {
break;
}

if ($prev->getIndex() !== null) {
return $this->missingIndexCache[$delimiter] = $prev->getIndex() + $i;
}
} while ($prev = $prev->getPrevious());

$j = 0;
do {
$j++;
if ($next === null) {
break;
}

if ($next->getIndex() !== null) {
return $this->missingIndexCache[$delimiter] = $next->getIndex() - $j;
}
} while ($next = $next->getNext());

// No index was defined on this delimiter, and none could be guesstimated based on the stack.
throw new LogicException('No index was defined on this delimiter, and none could be guessed based on the stack. Ensure you are passing the index when instantiating the Delimiter.');
}
}
4 changes: 2 additions & 2 deletions src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ public function parse(InlineParserContext $inlineContext): bool

// Process delimiters such as emphasis inside link/image
$delimiterStack = $inlineContext->getDelimiterStack();
$stackBottom = $opener->getPreviousDelimiter();
$stackBottom = $opener->getPosition();
$delimiterStack->processDelimiters($stackBottom, $this->environment->getDelimiterProcessors());
$delimiterStack->removeBracket();
$delimiterStack->removeAll($stackBottom);
Expand Down Expand Up @@ -179,7 +179,7 @@ private function tryParseReference(Cursor $cursor, ReferenceMapInterface $refere
} elseif (! $opener->hasNext()) {
// Empty or missing second label means to use the first label as the reference.
// The reference must not contain a bracket. If we know there's a bracket, we don't even bother checking it.
$start = $opener->getIndex();
$start = $opener->getPosition();
$length = $startPos - $start;
} else {
$cursor->restoreState($savePos);
Expand Down
3 changes: 2 additions & 1 deletion src/Extension/SmartPunct/QuoteParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ public function parse(InlineParserContext $inlineContext): bool
{
$char = $inlineContext->getFullMatch();
$cursor = $inlineContext->getCursor();
$index = $cursor->getPosition();

$charBefore = $cursor->peek(-1);
if ($charBefore === null) {
Expand All @@ -67,7 +68,7 @@ public function parse(InlineParserContext $inlineContext): bool
$inlineContext->getContainer()->appendChild($node);

// Add entry to stack to this opener
$inlineContext->getDelimiterStack()->push(new Delimiter($char, 1, $node, $canOpen, $canClose));
$inlineContext->getDelimiterStack()->push(new Delimiter($char, 1, $node, $canOpen, $canClose, $index));

return true;
}
Expand Down
4 changes: 4 additions & 0 deletions src/Extension/Table/TableParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

namespace League\CommonMark\Extension\Table;

use League\CommonMark\Exception\LogicException;
use League\CommonMark\Parser\Block\AbstractBlockContinueParser;
use League\CommonMark\Parser\Block\BlockContinue;
use League\CommonMark\Parser\Block\BlockContinueParserInterface;
Expand Down Expand Up @@ -150,6 +151,9 @@ public function parseInlines(InlineParserEngineInterface $inlineParser): void
}
}

/**
* @throws LogicException
*/
private function parseCell(string $cell, int $column, InlineParserEngineInterface $inlineParser): TableCell
{
$tableCell = new TableCell(TableCell::TYPE_DATA, $this->columns[$column] ?? null);
Expand Down
3 changes: 3 additions & 0 deletions src/Parser/Block/BlockContinueParserWithInlinesInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@

namespace League\CommonMark\Parser\Block;

use League\CommonMark\Exception\LogicException;
use League\CommonMark\Parser\InlineParserEngineInterface;

interface BlockContinueParserWithInlinesInterface extends BlockContinueParserInterface
{
/**
* Parse any inlines inside of the current block
*
* @throws LogicException
*/
public function parseInlines(InlineParserEngineInterface $inlineParser): void;
}
4 changes: 4 additions & 0 deletions src/Parser/Inline/InlineParserInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@

namespace League\CommonMark\Parser\Inline;

use League\CommonMark\Exception\LogicException;
use League\CommonMark\Parser\InlineParserContext;

interface InlineParserInterface
{
public function getMatchDefinition(): InlineParserMatch;

/**
* @throws LogicException
*/
public function parse(InlineParserContext $inlineContext): bool;
}
Loading

0 comments on commit b61bbd4

Please sign in to comment.