-
-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement the Chinese Whispers clustering algorithm in native PHP
To the happiness of many (Issue #690, #688, #687, #685, #649, #632, #627, #625, etc), this means that we do not depend on the pdlib extension, but it goes without saying that its use is still highly recommended. You will understand that it is slower, however I must admit that with JIT enabled, it is quite acceptable, and this is the only reason why decided to publish it. It is still experimental, and it works, but it has problems such as it seems not to converge in stable clusters. When I can fix this, it will probably be even slower.
- Loading branch information
1 parent
76fe598
commit 8854a1f
Showing
5 changed files
with
186 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
<?php | ||
declare(strict_types=1); | ||
/** | ||
* @copyright Copyright (c) 2023, Matias De lellis | ||
* | ||
* @author Matias De lellis <mati86dl@gmail.com> | ||
* | ||
* @license AGPL-3.0-or-later | ||
* | ||
* This code is free software: you can redistribute it and/or modify | ||
* it under the terms of the GNU Affero General Public License, version 3, | ||
* as published by the Free Software Foundation. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU Affero General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Affero General Public License, version 3, | ||
* along with this program. If not, see <http://www.gnu.org/licenses/> | ||
* | ||
*/ | ||
|
||
namespace OCA\FaceRecognition\Clusterer; | ||
|
||
|
||
/** | ||
* This class implements the graph clustering algorithm described in the | ||
* paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its | ||
* Application to Natural Language Processing Problems by Chris Biemann. | ||
* | ||
* In particular, it tries to be a shameless copy of the original dlib | ||
* implementation. | ||
* - https://github.com/davisking/dlib/blob/master/dlib/clustering/chinese_whispers.h | ||
*/ | ||
class ChineseWhispers { | ||
|
||
/** | ||
* Cluster the dataset by assigning a label to each sample.from the edges | ||
*/ | ||
static public function predict(array &$edges, array &$labels, int $num_iterations = 100) | ||
{ | ||
$labels = []; | ||
if (count($edges) == 0) | ||
return 0; | ||
|
||
$neighbors = []; | ||
self::find_neighbor_ranges($edges, $neighbors); | ||
|
||
// Initialize the labels, each node gets a different label. | ||
for ($i = 0; $i < count($neighbors); ++$i) | ||
$labels[$i] = $i; | ||
|
||
for ($iter = 0; $iter < count($neighbors)*$num_iterations; ++$iter) | ||
{ | ||
// Pick a random node. | ||
$idx = random_int(0, count($neighbors) - 1); | ||
|
||
// Count how many times each label happens amongst our neighbors. | ||
$labels_to_counts = []; | ||
$end = $neighbors[$idx][1]; | ||
|
||
for ($i = $neighbors[$idx][0]; $i != $end; ++$i) | ||
{ | ||
$iLabelFirst = $edges[$i][1]; | ||
$iLabel = $labels[$iLabelFirst]; | ||
if (isset($labels_to_counts[$iLabel])) | ||
$labels_to_counts[$iLabel]++; | ||
else | ||
$labels_to_counts[$iLabel] = 1; | ||
} | ||
|
||
// find the most common label | ||
// std::map<unsigned long, double>::iterator i; | ||
$best_score = PHP_INT_MIN; | ||
$best_label = $labels[$idx]; | ||
foreach ($labels_to_counts as $key => $value) | ||
{ | ||
if ($value > $best_score) | ||
{ | ||
$best_score = $value; | ||
$best_label = $key; | ||
} | ||
} | ||
|
||
$labels[$idx] = $best_label; | ||
} | ||
|
||
// Remap the labels into a contiguous range. First we find the | ||
// mapping. | ||
$label_remap = []; | ||
for ($i = 0; $i < count($labels); ++$i) | ||
{ | ||
$next_id = count($label_remap); | ||
if (!isset($label_remap[$labels[$i]])) | ||
$label_remap[$labels[$i]] = $next_id; | ||
} | ||
// now apply the mapping to all the labels. | ||
for ($i = 0; $i < count($labels); ++$i) | ||
{ | ||
$labels[$i] = $label_remap[$labels[$i]]; | ||
} | ||
|
||
return count($label_remap); | ||
} | ||
|
||
static function find_neighbor_ranges (&$edges, &$neighbors) { | ||
// setup neighbors so that [neighbors[i].first, neighbors[i].second) is the range | ||
// within edges that contains all node i's edges. | ||
$num_nodes = self::max_index_plus_one($edges); | ||
for ($i = 0; $i < $num_nodes; ++$i) $neighbors[$i] = [0, 0]; | ||
$cur_node = 0; | ||
$start_idx = 0; | ||
for ($i = 0; $i < count($edges); ++$i) | ||
{ | ||
if ($edges[$i][0] != $cur_node) | ||
{ | ||
$neighbors[$cur_node] = [$start_idx, $i]; | ||
$start_idx = $i; | ||
$cur_node = $edges[$i][0]; | ||
} | ||
} | ||
if (count($neighbors) !== 0) | ||
$neighbors[$cur_node] = [$start_idx, count($edges)]; | ||
} | ||
|
||
static function max_index_plus_one ($pairs): int { | ||
if (count($pairs) === 0) | ||
{ | ||
return 0; | ||
} | ||
else { | ||
$max_idx = 0; | ||
for ($i = 0; $i < count($pairs); ++$i) | ||
{ | ||
if ($pairs[$i][0] > $max_idx) | ||
$max_idx = $pairs[$i][0]; | ||
if ($pairs[$i][1] > $max_idx) | ||
$max_idx = $pairs[$i][1]; | ||
} | ||
return $max_idx + 1; | ||
} | ||
} | ||
|
||
static function convert_unordered_to_ordered (&$edges, &$out_edges) | ||
{ | ||
$out_edges = []; | ||
for ($i = 0; $i < count($edges); ++$i) | ||
{ | ||
$out_edges[] = [$edges[$i][0], $edges[$i][1]]; | ||
if ($edges[$i][0] != $edges[$i][1]) | ||
$out_edges[] = [$edges[$i][1], $edges[$i][0]]; | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters