Skip to content
This repository was archived by the owner on Dec 7, 2018. It is now read-only.

Minimal change from xrstf's multibyte diff awareness patch #10

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 73 additions & 34 deletions finediff.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,13 @@
* @link http://www.raymondhill.net/finediff/
* @version 0.6
* @license MIT License (http://www.opensource.org/licenses/mit-license.php)
*
* 10-Dec-2011 (Christoph Mewes):
* - added UTF-8 support, fixed strange usage of htmlentities
*/

mb_internal_encoding('UTF-8');

/**
* Usage (simplest):
*
Expand Down Expand Up @@ -107,13 +112,13 @@ public function getFromLen() {
return 0;
}
public function getToLen() {
return strlen($this->text);
return mb_strlen($this->text);
}
public function getText() {
return $this->text;
}
public function getOpcode() {
$to_len = strlen($this->text);
$to_len = mb_strlen($this->text);
if ( $to_len === 1 ) {
return "i:{$this->text}";
}
Expand All @@ -130,7 +135,7 @@ public function getFromLen() {
return $this->fromLen;
}
public function getToLen() {
return strlen($this->text);
return mb_strlen($this->text);
}
public function getText() {
return $this->text;
Expand All @@ -142,7 +147,7 @@ public function getOpcode() {
else {
$del_opcode = "d{$this->fromLen}";
}
$to_len = strlen($this->text);
$to_len = mb_strlen($this->text);
if ( $to_len === 1 ) {
return "{$del_opcode}i:{$this->text}";
}
Expand Down Expand Up @@ -177,6 +182,8 @@ public function increase($size) {
* Collection of ops
*/
class FineDiffOps {
	/**
	 * Ops collected by appendOpcode(), in the order they were appended.
	 */
	public $edits = array();

	/**
	 * Append the op described by a single opcode to $this->edits.
	 *
	 * Takes the same parameter list as the render*FromOpcode callbacks, so it
	 * is presumably meant to be passed as a callback to
	 * FineDiff::renderFromOpcodes() -- confirm against the caller.
	 *
	 * @param string $opcode      'c' (copy), 'd' (delete); anything else is treated as insert
	 * @param string $from        text the inserted characters are taken from
	 * @param int    $from_offset character offset into $from (insert only)
	 * @param int    $from_len    number of characters the op covers
	 */
	public function appendOpcode($opcode, $from, $from_offset, $from_len) {
		// Must write to the property: a bare `$edits[] = ...` only fills a
		// throw-away local and leaves the collection permanently empty.
		if ( $opcode === 'c' ) {
			$this->edits[] = new FineDiffCopyOp($from_len);
		}
		// NOTE(review): this condition is inferred from the FineDiffDeleteOp
		// branch and the 'i' comment below -- confirm against the full file.
		else if ( $opcode === 'd' ) {
			$this->edits[] = new FineDiffDeleteOp($from_len);
		}
		else /* if ( $opcode === 'i' ) */ {
			// offsets and lengths are character counts, so slice with mb_substr
			$this->edits[] = new FineDiffInsertOp(mb_substr($from, $from_offset, $from_len));
		}
	}
}

/**
Expand Down Expand Up @@ -299,14 +305,14 @@ public static function renderFromOpcodes($from, $opcodes, $callback) {
if ( !is_callable($callback) ) {
return;
}
$opcodes_len = strlen($opcodes);
$opcodes_len = mb_strlen($opcodes);
$from_offset = $opcodes_offset = 0;
while ( $opcodes_offset < $opcodes_len ) {
$opcode = substr($opcodes, $opcodes_offset, 1);
$opcode = mb_substr($opcodes, $opcodes_offset, 1);
$opcodes_offset++;
$n = intval(substr($opcodes, $opcodes_offset));
$n = intval(mb_substr($opcodes, $opcodes_offset));
if ( $n ) {
$opcodes_offset += strlen(strval($n));
$opcodes_offset += mb_strlen(strval($n));
}
else {
$n = 1;
Expand Down Expand Up @@ -394,7 +400,7 @@ private function _processGranularity($from_segment, $to_segment) {
// increase granularity
if ( $fragment_edit instanceof FineDiffReplaceOp && $has_next_stage ) {
$this->_processGranularity(
substr($this->from_text, $this->from_offset, $fragment_edit->getFromLen()),
mb_substr($this->from_text, $this->from_offset, $fragment_edit->getFromLen()),
$fragment_edit->getText()
);
}
Expand Down Expand Up @@ -432,8 +438,8 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
$result = array();

// fragment-level diffing
$from_text_len = strlen($from_text);
$to_text_len = strlen($to_text);
$from_text_len = mb_strlen($from_text);
$to_text_len = mb_strlen($to_text);
$from_fragments = FineDiff::extractFragments($from_text, $delimiters);
$to_fragments = FineDiff::extractFragments($to_text, $delimiters);

Expand All @@ -454,7 +460,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
$result[$from_segment_start * 4] = new FineDiffDeleteOp($from_segment_length);
}
else if ( $to_segment_length ) {
$result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(substr($to_text, $to_segment_start, $to_segment_length));
$result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(mb_substr($to_text, $to_segment_start, $to_segment_length));
}
continue;
}
Expand All @@ -468,7 +474,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {

while ( $from_base_fragment_index < $from_segment_end ) {
$from_base_fragment = $from_fragments[$from_base_fragment_index];
$from_base_fragment_length = strlen($from_base_fragment);
$from_base_fragment_length = mb_strlen($from_base_fragment);
// performance boost: cache array keys
if ( !isset($cached_array_keys_for_current_segment[$from_base_fragment]) ) {
if ( !isset($cached_array_keys[$from_base_fragment]) ) {
Expand Down Expand Up @@ -510,7 +516,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
if ( $from_fragments[$fragment_from_index] !== $to_fragments[$fragment_to_index] ) {
break;
}
$fragment_length = strlen($from_fragments[$fragment_from_index]);
$fragment_length = mb_strlen($from_fragments[$fragment_from_index]);
$fragment_index_offset += $fragment_length;
}
if ( $fragment_index_offset > $best_copy_length ) {
Expand All @@ -519,7 +525,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
$best_to_start = $to_base_fragment_index;
}
}
$from_base_fragment_index += strlen($from_base_fragment);
$from_base_fragment_index += mb_strlen($from_base_fragment);
// If match is larger than half segment size, no point trying to find better
// TODO: Really?
if ( $best_copy_length >= $from_segment_length / 2) {
Expand All @@ -538,7 +544,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
$jobs[] = array($best_from_start + $best_copy_length, $from_segment_end, $best_to_start + $best_copy_length, $to_segment_end);
}
else {
$result[$from_segment_start * 4 ] = new FineDiffReplaceOp($from_segment_length, substr($to_text, $to_segment_start, $to_segment_length));
$result[$from_segment_start * 4 ] = new FineDiffReplaceOp($from_segment_length, mb_substr($to_text, $to_segment_start, $to_segment_length));
}
}

Expand All @@ -564,7 +570,7 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
*/
private static function doCharDiff($from_text, $to_text) {
$result = array();
$jobs = array(array(0, strlen($from_text), 0, strlen($to_text)));
$jobs = array(array(0, mb_strlen($from_text), 0, mb_strlen($to_text)));
while ( $job = array_pop($jobs) ) {
// get the segments which must be diff'ed
list($from_segment_start, $from_segment_end, $to_segment_start, $to_segment_end) = $job;
Expand All @@ -577,7 +583,7 @@ private static function doCharDiff($from_text, $to_text) {
$result[$from_segment_start * 4 + 0] = new FineDiffDeleteOp($from_segment_len);
}
else if ( $to_segment_len ) {
$result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(substr($to_text, $to_segment_start, $to_segment_len));
$result[$from_segment_start * 4 + 1] = new FineDiffInsertOp(mb_substr($to_text, $to_segment_start, $to_segment_len));
}
continue;
}
Expand All @@ -587,7 +593,7 @@ private static function doCharDiff($from_text, $to_text) {
$to_copy_start = $to_segment_start;
$to_copy_start_max = $to_segment_end - $copy_len;
while ( $to_copy_start <= $to_copy_start_max ) {
$from_copy_start = strpos(substr($from_text, $from_segment_start, $from_segment_len), substr($to_text, $to_copy_start, $copy_len));
$from_copy_start = mb_strpos(mb_substr($from_text, $from_segment_start, $from_segment_len), mb_substr($to_text, $to_copy_start, $copy_len));
if ( $from_copy_start !== false ) {
$from_copy_start += $from_segment_start;
break 2;
Expand All @@ -603,7 +609,7 @@ private static function doCharDiff($from_text, $to_text) {
$from_copy_start = $from_segment_start;
$from_copy_start_max = $from_segment_end - $copy_len;
while ( $from_copy_start <= $from_copy_start_max ) {
$to_copy_start = strpos(substr($to_text, $to_segment_start, $to_segment_len), substr($from_text, $from_copy_start, $copy_len));
$to_copy_start = mb_strpos(mb_substr($to_text, $to_segment_start, $to_segment_len), mb_substr($from_text, $from_copy_start, $copy_len));
if ( $to_copy_start !== false ) {
$to_copy_start += $to_segment_start;
break 2;
Expand All @@ -621,7 +627,7 @@ private static function doCharDiff($from_text, $to_text) {
}
// no match, so delete all, insert all
else {
$result[$from_segment_start * 4] = new FineDiffReplaceOp($from_segment_len, substr($to_text, $to_segment_start, $to_segment_len));
$result[$from_segment_start * 4] = new FineDiffReplaceOp($from_segment_len, mb_substr($to_text, $to_segment_start, $to_segment_len));
}
}
ksort($result, SORT_NUMERIC);
Expand All @@ -641,19 +647,19 @@ private static function doCharDiff($from_text, $to_text) {
private static function extractFragments($text, $delimiters) {
// special case: split into characters
if ( empty($delimiters) ) {
$chars = str_split($text, 1);
$chars[strlen($text)] = '';
$chars = self::splitToChars($text);
$chars[] = '';
return $chars;
}
$fragments = array();
$start = $end = 0;
for (;;) {
$end += strcspn($text, $delimiters, $end);
$end += strspn($text, $delimiters, $end);
$end += self::mb_strcspn($text, $delimiters, $end);
$end += self::mb_strspn($text, $delimiters, $end);
if ( $end === $start ) {
break;
}
$fragments[$start] = substr($text, $start, $end - $start);
$fragments[$start] = mb_substr($text, $start, $end - $start);
$start = $end;
}
$fragments[$start] = '';
Expand All @@ -665,24 +671,57 @@ private static function extractFragments($text, $delimiters) {
*/
private static function renderToTextFromOpcode($opcode, $from, $from_offset, $from_len) {
	// Copy ('c') and insert ('i') opcodes contribute text to the output;
	// any other opcode (i.e. delete) prints nothing.
	if ( $opcode === 'c' || $opcode === 'i' ) {
		// Emit the slice exactly once, addressed in characters. A byte-based
		// substr() here would cut multibyte sequences apart and, next to the
		// mb_substr() call, would print the fragment twice.
		echo mb_substr($from, $from_offset, $from_len);
	}
}

private static function renderDiffToHTMLFromOpcode($opcode, $from, $from_offset, $from_len) {
	// Prints one opcode as HTML: copied text as-is, deleted text wrapped in
	// <del>, inserted text wrapped in <ins>. The fragment is always escaped
	// once, with the same flags and an explicit charset on every branch.
	$fragment = mb_substr($from, $from_offset, $from_len);

	if ( $opcode === 'c' ) {
		echo htmlspecialchars($fragment, ENT_QUOTES, 'UTF-8');
	}
	else if ( $opcode === 'd' ) {
		// A deletion that starts with a space or line break (or is empty)
		// gets its line breaks shown as literal \n / \r so they stay visible.
		// Byte-wise strcspn() is fine here: the three characters tested are
		// single-byte ASCII and never occur inside a UTF-8 multibyte sequence.
		if ( strcspn($fragment, " \n\r") === 0 ) {
			$fragment = str_replace(array("\n","\r"), array('\n','\r'), $fragment);
		}
		echo '<del>', htmlspecialchars($fragment, ENT_QUOTES, 'UTF-8'), '</del>';
	}
	else /* if ( $opcode === 'i' ) */ {
		echo '<ins>', htmlspecialchars($fragment, ENT_QUOTES, 'UTF-8'), '</ins>';
	}
}

private static function splitToChars($str) {
	// Splits $str into an array of single UTF-8 characters.
	// An empty string yields array(''), as before.
	$str = (string) $str;
	if ( $str === '' ) {
		return array('');
	}

	// '.' with /u matches one code point, /s lets it match line breaks too.
	if ( preg_match_all('/./us', $str, $matches) ) {
		return $matches[0];
	}

	// preg_match_all() gave no result for a non-empty string, i.e. PCRE
	// rejected the subject (typically malformed UTF-8). Fall back to a
	// byte-wise split instead of silently dropping the whole input.
	return str_split($str, 1);
}

private static function mb_strcspn($str, $delimiters, $start) {
	// Character-based counterpart of strcspn(): returns how many characters
	// follow offset $start in $str before the first character that appears
	// in $delimiters (or before the end of $str when none does).
	$dels = self::splitToChars($delimiters);
	$min  = mb_strlen($str);

	foreach ($dels as $del) {
		// splitToChars() returns '' for an empty delimiter set. An empty
		// needle is not a delimiter: mb_strpos() warns about it on PHP < 8
		// and reports a match at $start on PHP 8+, so skip it explicitly.
		if ( $del === '' ) {
			continue;
		}
		$pos = mb_strpos($str, $del, $start);
		if ( $pos !== false && $pos < $min ) {
			$min = $pos;
		}
	}

	return $min - $start;
}

private static function mb_strspn($str, $delimiters, $start) {
	// Character-based counterpart of strspn(): returns how many consecutive
	// characters of $str, beginning at character offset $start, are members
	// of $delimiters.
	$delimiters = (string) $delimiters;

	// No usable delimiter set means nothing can match. Checking up front
	// also keeps an empty or malformed set out of the pattern below.
	if ( $delimiters === '' || !mb_check_encoding($delimiters, 'UTF-8') ) {
		return 0;
	}

	// preg_quote() escapes every character that is special inside a
	// character class (\ ] ^ -), so the set can be dropped in as-is.
	// A class is used rather than a repeated capturing alternation such as
	// ^(a|b)+ so that long delimiter runs do not depend on PCRE's
	// backtracking/recursion limits.
	$class = preg_quote($delimiters, '/');

	if ( preg_match('/^[' . $class . ']+/u', mb_substr($str, $start), $match) === 1 ) {
		return mb_strlen($match[0]);
	}

	// no leading delimiter, or PCRE rejected the subject
	return 0;
}
}