Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTMLDiff Performance enhancement #54

Merged
merged 8 commits into from
Jun 6, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ vendor/
/demo/bower_components
/demo/node_modules
.DS_Store
.idea
30 changes: 26 additions & 4 deletions lib/Caxy/HtmlDiff/AbstractDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ abstract class AbstractDiff
*/
protected $purifierConfig = null;

/**
* When true, the next call to HtmlDiff::array_slice_cached() discards its cached slice.
*
* @see HtmlDiff::array_slice_cached()
* @var bool
*/
protected $resetCache = false;

/**
* AbstractDiff constructor.
*
Expand Down Expand Up @@ -113,8 +119,6 @@ abstract public function build();
*/
public function initPurifier($defaultPurifierSerializerCache = null)
{
$HTMLPurifierConfig = null;

if (null !== $this->purifierConfig) {
$HTMLPurifierConfig = $this->purifierConfig;
} else {
Expand Down Expand Up @@ -423,8 +427,26 @@ protected function purifyHtml($html)

/**
 * Runs both input texts through explode() and convertHtmlToListOfWords()
 * and stores the results as the old and new word lists.
 */
protected function splitInputsToWords()
{
    // Each input is tokenised exactly once. Going through the setters (rather
    // than assigning the properties directly) also sets $this->resetCache, so
    // a slice cached for a previous word list is not reused.
    $this->setOldWords($this->convertHtmlToListOfWords($this->explode($this->oldText)));
    $this->setNewWords($this->convertHtmlToListOfWords($this->explode($this->newText)));
}

/**
* @param array $oldWords
*/
protected function setOldWords(array $oldWords)
{
    // Store the new list, then mark any cached slice as stale.
    $this->oldWords = $oldWords;
    $this->resetCache = true;
}

/**
* @param array $newWords
*/
protected function setNewWords(array $newWords)
{
    // Store the new list, then mark any cached slice as stale.
    $this->newWords = $newWords;
    $this->resetCache = true;
}

/**
Expand Down
101 changes: 84 additions & 17 deletions lib/Caxy/HtmlDiff/HtmlDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -694,28 +694,32 @@ protected function operations()
$positionInOld = 0;
$positionInNew = 0;
$operations = array();
$matches = $this->matchingBlocks();

$matches = $this->matchingBlocks();
$matches[] = new Match(count($this->oldWords), count($this->newWords), 0);

foreach ($matches as $i => $match) {
$matchStartsAtCurrentPositionInOld = ($positionInOld == $match->startInOld);
$matchStartsAtCurrentPositionInNew = ($positionInNew == $match->startInNew);
$action = 'none';
$matchStartsAtCurrentPositionInOld = ($positionInOld === $match->startInOld);
$matchStartsAtCurrentPositionInNew = ($positionInNew === $match->startInNew);

if ($matchStartsAtCurrentPositionInOld == false && $matchStartsAtCurrentPositionInNew == false) {
if ($matchStartsAtCurrentPositionInOld === false && $matchStartsAtCurrentPositionInNew === false) {
$action = 'replace';
} elseif ($matchStartsAtCurrentPositionInOld == true && $matchStartsAtCurrentPositionInNew == false) {
} elseif ($matchStartsAtCurrentPositionInOld === true && $matchStartsAtCurrentPositionInNew === false) {
$action = 'insert';
} elseif ($matchStartsAtCurrentPositionInOld == false && $matchStartsAtCurrentPositionInNew == true) {
} elseif ($matchStartsAtCurrentPositionInOld === false && $matchStartsAtCurrentPositionInNew === true) {
$action = 'delete';
} else { // This occurs if the first few words are the same in both versions
$action = 'none';
}
if ($action != 'none') {

if ($action !== 'none') {
$operations[] = new Operation($action, $positionInOld, $match->startInOld, $positionInNew, $match->startInNew);
}
if (count($match) != 0) {

if (count($match) !== 0) {
$operations[] = new Operation('equal', $match->startInOld, $match->endInOld(), $match->startInNew, $match->endInNew());
}

$positionInOld = $match->endInOld();
$positionInNew = $match->endInNew();
}
Expand Down Expand Up @@ -744,11 +748,14 @@ protected function matchingBlocks()
protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endInNew, &$matchingBlocks)
{
$match = $this->findMatch($startInOld, $endInOld, $startInNew, $endInNew);

if ($match !== null) {
if ($startInOld < $match->startInOld && $startInNew < $match->startInNew) {
$this->findMatchingBlocks($startInOld, $match->startInOld, $startInNew, $match->startInNew, $matchingBlocks);
}

$matchingBlocks[] = $match;

if ($match->endInOld() < $endInOld && $match->endInNew() < $endInNew) {
$this->findMatchingBlocks($match->endInOld(), $endInOld, $match->endInNew(), $endInNew, $matchingBlocks);
}
Expand All @@ -762,9 +769,13 @@ protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endI
*/
protected function stripTagAttributes($word)
{
    // Drop the surrounding angle brackets, then keep only the text before the
    // first space (the tag name, including a leading or trailing "/").
    // The limit of 2 stops explode() from splitting the attribute text
    // further than needed.
    $parts = explode(' ', trim($word, '<>'), 2);

    // Always re-wrap in angle brackets so tags with and without attributes
    // reduce to the same form, e.g. '<p class="x">' and '<p>' both give '<p>'.
    return '<' . $parts[0] . '>';
}

/**
Expand All @@ -781,6 +792,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
$bestMatchInNew = $startInNew;
$bestMatchSize = 0;
$matchLengthAt = array();

for ($indexInOld = $startInOld; $indexInOld < $endInOld; ++$indexInOld) {
$newMatchLengthAt = array();
$index = $this->oldWords[ $indexInOld ];
Expand All @@ -798,16 +810,15 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
if ($indexInNew >= $endInNew) {
break;
}

$newMatchLength = (isset($matchLengthAt[ $indexInNew - 1 ]) ? $matchLengthAt[ $indexInNew - 1 ] : 0) + 1;
$newMatchLengthAt[ $indexInNew ] = $newMatchLength;

if ($newMatchLength > $bestMatchSize ||
(
$this->isGroupDiffs() &&
$bestMatchSize > 0 &&
preg_match(
'/^\s+$/',
implode('', array_slice($this->oldWords, $bestMatchInOld, $bestMatchSize))
)
$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
)
) {
$bestMatchInOld = $indexInOld - $newMatchLength + 1;
Expand All @@ -822,12 +833,68 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
if ($bestMatchSize != 0 &&
(
!$this->isGroupDiffs() ||
!preg_match('/^\s+$/', implode('', array_slice($this->oldWords, $bestMatchInOld, $bestMatchSize)))
!$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
)
) {
return new Match($bestMatchInOld, $bestMatchInNew, $bestMatchSize);
}

return;
return null;
}

/**
* @param string $str
*
* @return bool
*/
protected function isOnlyWhitespace($str)
{
    // An empty string does not count as whitespace.
    if ($str === '') {
        return false;
    }

    // Slightly faster than using preg_match: if nothing is left after
    // trimming, the string consisted solely of characters trim() strips.
    return trim($str) === '';
}

/**
* Special array_slice function that caches its last request.
*
* The diff algorithm seems to request the same information many times in a row.
* By returning the previous answer, the algorithm performs much faster.
*
* The result is a string instead of an array; this way we save on the number of
* memory-intensive implode() calls.
*
* @param array &$array
* @param integer $offset
* @param integer|null $length
*
* @return string
*/
protected function array_slice_cached(&$array, $offset, $length = null)
{
    // Function-static variables are shared by every instance of the class,
    // whereas $this->resetCache is per instance. The cache entry therefore
    // also records which object produced it, so that interleaved calls from
    // two objects cannot return each other's slice.
    static $lastOwner = null;
    static $lastOffset = null;
    static $lastLength = null;
    static $cache = null;

    $owner = spl_object_hash($this);

    // PHP has no support for by-reference comparing, so the array itself is
    // not part of the cache key. To prevent false positive hits, drop the
    // cache when oldWords/newWords changed (resetCache) or when the cached
    // entry belongs to a different object.
    if ($this->resetCache === true || $lastOwner !== $owner) {
        $cache = null;

        $this->resetCache = false;
    }

    if (
        $cache !== null &&
        $lastLength === $length &&
        $lastOffset === $offset
    ) { // Hit
        return $cache;
    } // Miss

    $lastOwner = $owner;
    $lastOffset = $offset;
    $lastLength = $length;

    // Cache the joined string rather than the slice so repeated requests do
    // not pay for implode() again.
    $cache = implode('', array_slice($array, $offset, $length));

    return $cache;
}
}
6 changes: 6 additions & 0 deletions phpunit.xml.dist
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@
bootstrap="./tests/Caxy/Tests/TestInit.php"
>

<groups>
<exclude>
<group>performance</group>
</exclude>
</groups>

<testsuites>
<testsuite name="php-htmldiff Test Suite">
<directory>./tests/Caxy/Tests/HtmlDiff</directory>
Expand Down
17 changes: 17 additions & 0 deletions tests/Caxy/Tests/AbstractTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?php

namespace Caxy\Tests;

abstract class AbstractTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Normalises whitespace in markup so two HTML strings can be compared
     * without regard to formatting.
     *
     * Line breaks are removed, every remaining run of whitespace becomes a
     * single space, whitespace between a ">" and the following "<" is dropped,
     * and the result is trimmed.
     *
     * @param string $text
     *
     * @return string
     */
    protected function stripExtraWhitespaceAndNewLines($text)
    {
        $withoutLineBreaks = preg_replace("/[\n\r]/", '', $text);
        $singleSpaced = preg_replace('/\s+/S', " ", $withoutLineBreaks);
        $tagsJoined = preg_replace('/>\s+</', '><', $singleSpaced);

        return trim($tagsJoined);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@

use Caxy\HtmlDiff\HtmlDiff;
use Caxy\HtmlDiff\HtmlDiffConfig;
use Caxy\Tests\AbstractTest;

class HTMLPurifierConfigTest extends \PHPUnit_Framework_TestCase
class HTMLPurifierConfigTest extends AbstractTest
{
/**
* @var \HTMLPurifier_Config
Expand Down
14 changes: 2 additions & 12 deletions tests/Caxy/Tests/HtmlDiff/Functional/HtmlDiffFunctionalTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
namespace Caxy\Tests\HtmlDiff\Functional;

use Caxy\HtmlDiff\HtmlDiff;
use Caxy\Tests\AbstractTest;
use Caxy\Tests\HtmlDiff\HtmlFileIterator;

class HtmlDiffFunctionalTest extends \PHPUnit_Framework_TestCase
class HtmlDiffFunctionalTest extends AbstractTest
{
/**
* @dataProvider diffContentProvider
Expand All @@ -26,15 +27,4 @@ public function diffContentProvider()
{
return new HtmlFileIterator(__DIR__.'/../../../../fixtures/HtmlDiff');
}

protected function stripExtraWhitespaceAndNewLines($text)
{
    // 1. Remove line breaks outright.
    $withoutLineBreaks = preg_replace("/[\n\r]/", '', $text);

    // 2. Collapse every remaining run of whitespace into a single space.
    $singleSpaced = preg_replace('/\s+/S', " ", $withoutLineBreaks);

    // 3. Drop whitespace that sits between two adjacent tags.
    $tagsJoined = preg_replace('/>\s+</', '><', $singleSpaced);

    return trim($tagsJoined);
}
}
29 changes: 29 additions & 0 deletions tests/Caxy/Tests/HtmlDiff/Performance/PerformanceTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?php

namespace Caxy\Tests\HtmlDiff\Performance;

use Caxy\HtmlDiff\HtmlDiff;
use Caxy\Tests\AbstractTest;

class PerformanceTest extends AbstractTest
{
    /**
     * Diffs the paragraph fixtures and compares the whitespace-normalised
     * output with the stored expected markup.
     *
     * @group performance
     */
    public function testParagraphPerformance()
    {
        $fixturesPath = __DIR__ . '/../../../../fixtures/Performance/';

        $expected = file_get_contents($fixturesPath . 'paragraphs_expected.html');

        $diff = new HtmlDiff(
            file_get_contents($fixturesPath . 'paragraphs.html'),
            file_get_contents($fixturesPath . 'paragraphs_changed.html'),
            'UTF-8',
            array()
        );

        $output = $diff->build();

        // PHPUnit's signature is assertSame($expected, $actual); passing the
        // values in that order keeps the failure message labelled correctly.
        $this->assertSame(
            $this->stripExtraWhitespaceAndNewLines($expected),
            $this->stripExtraWhitespaceAndNewLines($output)
        );
    }
}
Loading