Skip to content

Commit

Permalink
Implement the Chinese Whispers clustering algorithm in PHP code.
Browse files Browse the repository at this point in the history
To the happiness of many (issues #690, #688, #687, #685, #649, #632,
#627, #625, etc.), this means that we no longer depend on the pdlib
extension, although it goes without saying that its use is still highly
recommended.

As you would expect, it is slower; however, I must admit that with
JIT enabled it is quite acceptable, and this is the only reason why I
decided to publish it.
  • Loading branch information
matiasdelellis committed Aug 23, 2023
1 parent 76fe598 commit 5574532
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 28 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
matrix:
php-versions: ['8.1']
databases: ['mysql']
server-versions: ['stable26']
server-versions: ['stable27']

name: php${{ matrix.php-versions }}-${{ matrix.databases }}-${{ matrix.server-versions }}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/phpunit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
matrix:
php-versions: ['8.0', '8.1', '8.2']
databases: ['sqlite', 'mysql', 'pgsql']
server-versions: ['stable26']
server-versions: ['stable27']

name: php${{ matrix.php-versions }}-${{ matrix.databases }}-${{ matrix.server-versions }}

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/static-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
ocp-version: ['dev-stable26']
ocp-version: ['dev-stable27']
name: Nextcloud ${{ matrix.ocp-version }}
steps:
- name: Checkout
Expand Down
6 changes: 2 additions & 4 deletions appinfo/info.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
- **🚀 Build your own thing:** FaceRecognition app is just a basic building block. Through FaceRecognition API, you can build your advanced scenarios - automatically add tags to images, connect contacts and persons, share images from specific person… We want to hear your ideas!
]]>
</description>
<version>0.9.20</version>
<version>0.9.30</version>
<licence>agpl</licence>
<author>Matias De lellis</author>
<author>Branko Kokanovic</author>
Expand All @@ -34,9 +34,7 @@
<screenshot>https://matiasdelellis.github.io/img/facerecognition/facerecognition-assign-initial-name.jpeg</screenshot>
<dependencies>
<php min-version="8.0" max-version="8.2" />
<lib>pdlib</lib>
<lib>bz2</lib>
<nextcloud min-version="26" max-version="27"/>
<nextcloud min-version="27" max-version="27"/>
</dependencies>
<repair-steps>
<uninstall>
Expand Down
14 changes: 4 additions & 10 deletions lib/BackgroundJob/Tasks/CheckRequirementsTask.php
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,10 @@ public function execute(FaceRecognitionContext $context) {
$phpMemory = MemoryLimits::getPhpMemory();
$this->logDebug("PHP Memory Limit: " . ($phpMemory > 0 ? $phpMemory : "Unknown"));

$this->logDebug("Clustering backend: " . (Requirements::pdlibLoaded() ? "pdlib" : "PHP (Not recommended."));

if ($this->imaginaryHelper->isEnabled()) {
$this->logDebug("Backend of images: Imaginary");
$this->logDebug("Image Backend: Imaginary");
$version = $this->imaginaryHelper->getVersion();
if ($version) {
$this->logDebug("Imaginary version: " . $version);
Expand All @@ -104,15 +106,7 @@ public function execute(FaceRecognitionContext $context) {
return false;
}
} else {
$this->logDebug("Backend of images: Imagick");
}

if (!Requirements::pdlibLoaded()) {
$error_message =
"The PDlib PHP extension is not loaded. Cannot continue without it." .
"Please read the documentation again about how to install the application: https://github.com/matiasdelellis/facerecognition/wiki/Installation";
$this->logInfo($error_message);
return false;
$this->logDebug("Image Backend: Imagick");
}

if (!Requirements::hasEnoughMemory()) {
Expand Down
32 changes: 25 additions & 7 deletions lib/BackgroundJob/Tasks/CreateClustersTask.php
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?php
/**
* @copyright Copyright (c) 2017-2020 Matias De lellis <mati86dl@gmail.com>
* @copyright Copyright (c) 2017-2023 Matias De lellis <mati86dl@gmail.com>
* @copyright Copyright (c) 2018, Branko Kokanovic <branko@kokanovic.org>
*
* @author Branko Kokanovic <branko@kokanovic.org>
Expand Down Expand Up @@ -33,6 +33,9 @@
use OCA\FaceRecognition\Db\PersonMapper;

use OCA\FaceRecognition\Helper\Euclidean;
use OCA\FaceRecognition\Helper\Requirements;

use OCA\FaceRecognition\Clusterer\ChineseWhispers;

use OCA\FaceRecognition\Service\SettingsService;
/**
Expand Down Expand Up @@ -282,10 +285,9 @@ private function getNewClusters(array $faces): array {
// Clustering parameters
$sensitivity = $this->settingsService->getSensitivity();

// Create edges for chinese whispers
$edges = array();

if (version_compare(phpversion('pdlib'), '1.0.2', '>=')) {
if (Requirements::pdlibLoaded()) {
// Create edges (neighbors) for Chinese Whispers
$edges = array();
$faces_count = count($faces);
for ($i = 0; $i < $faces_count; $i++) {
$face1 = $faces[$i];
Expand All @@ -304,8 +306,14 @@ private function getNewClusters(array $faces): array {
}
}
}

// Given the edges get the list of labels (found clusters) for each face.
$newChineseClustersByIndex = dlib_chinese_whispers($edges);
} else {
// Create edges (neighbors) for Chinese Whispers
$edges = array();
$faces_count = count($faces);

for ($i = 0; $i < $faces_count; $i++) {
$face1 = $faces[$i];
if (!isset($face1->descriptor)) {
Expand All @@ -323,17 +331,27 @@ private function getNewClusters(array $faces): array {
}
}
}

// The clustering algorithm actually expects ordered lists.
$oedges = [];
ChineseWhispers::convert_unordered_to_ordered($edges, $oedges);
usort($oedges, function($a, $b) {
if ($a[0] === $b[0]) return $a[1] - $b[1];
return $a[0] - $b[0];
});

// Given the edges get the list of labels (found clusters) for each face.
$newChineseClustersByIndex = [];
ChineseWhispers::predict($oedges, $newChineseClustersByIndex);
}

$newChineseClustersByIndex = dlib_chinese_whispers($edges);
$newClusters = array();
for ($i = 0, $c = count($newChineseClustersByIndex); $i < $c; $i++) {
if (!isset($newClusters[$newChineseClustersByIndex[$i]])) {
$newClusters[$newChineseClustersByIndex[$i]] = array();
}
$newClusters[$newChineseClustersByIndex[$i]][] = $faces[$i]->id;
}

return $newClusters;
}

Expand Down
159 changes: 159 additions & 0 deletions lib/Clusterer/ChineseWhispers.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
<?php
declare(strict_types=1);
/**
* @copyright Copyright (c) 2023, Matias De lellis
*
* @author Matias De lellis <mati86dl@gmail.com>
*
* @license AGPL-3.0-or-later
*
* This code is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License, version 3,
* along with this program. If not, see <http://www.gnu.org/licenses/>
*
*/

namespace OCA\FaceRecognition\Clusterer;


/**
* This class implements the graph clustering algorithm described in the
* paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
* Application to Natural Language Processing Problems by Chris Biemann.
*
* In particular, it tries to be a shameless copy of the original dlib
* implementation.
* - https://github.com/davisking/dlib/blob/master/dlib/clustering/chinese_whispers.h
*/
class ChineseWhispers {

	/**
	 * Fixed seed for the order in which nodes are visited, so that the same
	 * edge list always produces the same clusters.
	 */
	private const RANDOM_SEED = 2023;

	/**
	 * Cluster the dataset by assigning a label to each sample from the edges.
	 *
	 * The edge list must be "ordered": every undirected link appears in both
	 * directions and the list is sorted by first index (then second index).
	 * See convert_unordered_to_ordered() for the first half of that contract.
	 *
	 * @param array $edges List of [node, neighbor] integer index pairs. Only read.
	 * @param array $labels Output. Replaced by a list where $labels[$node] is the
	 *                      cluster id of that node, ids being contiguous from 0.
	 * @param int $num_iterations Each node is visited on average this many times.
	 *
	 * @return int Number of clusters found (0 when there are no edges).
	 */
	public static function predict(array &$edges, array &$labels, int $num_iterations = 100): int
	{
		$labels = [];
		if (count($edges) === 0) {
			return 0;
		}

		$neighbors = [];
		self::find_neighbor_ranges($edges, $neighbors);
		$num_nodes = count($neighbors);

		// Initialize the labels, each node gets a different label.
		for ($node = 0; $node < $num_nodes; ++$node) {
			$labels[$node] = $node;
		}

		// To improve the stability of the clusters, we must iterate the
		// nodes in a pseudo-random but reproducible way.
		mt_srand(self::RANDOM_SEED);
		try {
			$total_steps = $num_nodes * $num_iterations;
			for ($step = 0; $step < $total_steps; ++$step) {
				// Pick a random node.
				$idx = mt_rand() % $num_nodes;
				$labels[$idx] = self::most_common_neighbor_label($edges, $neighbors[$idx], $labels, $labels[$idx]);
			}
		} finally {
			// The Mersenne Twister state is global to the process. Do not leave
			// it on a fixed, publicly known seed for whatever code runs next.
			mt_srand();
		}

		// Remap the labels into a contiguous range, numbering each label
		// in the order in which it is first seen.
		$label_remap = [];
		foreach ($labels as $node => $label) {
			$label_remap[$label] ??= count($label_remap);
			$labels[$node] = $label_remap[$label];
		}

		return count($label_remap);
	}

	/**
	 * Return the label that occurs most often amongst the neighbors of a node.
	 *
	 * @param array $edges Ordered edge list.
	 * @param array $range [start, end) slice of $edges holding the node's edges.
	 * @param array $labels Current label of every node.
	 * @param int $fallback Label returned when the node has no neighbors.
	 */
	private static function most_common_neighbor_label(array $edges, array $range, array $labels, int $fallback): int
	{
		// Count how many times each label happens amongst our neighbors.
		$labels_to_counts = [];
		[$start, $end] = $range;
		for ($i = $start; $i < $end; ++$i) {
			$label = $labels[$edges[$i][1]];
			$labels_to_counts[$label] = ($labels_to_counts[$label] ?? 0) + 1;
		}

		// Find the most common label. On a tie the label seen first wins.
		// NOTE(review): the C++ code this was ported from appears to walk a
		// std::map (sorted by label) at this point, while a PHP array is walked
		// in insertion order, so ties may resolve differently - confirm whether
		// exact parity with the reference implementation matters.
		$best_score = PHP_INT_MIN;
		$best_label = $fallback;
		foreach ($labels_to_counts as $label => $count) {
			if ($count > $best_score) {
				$best_score = $count;
				$best_label = $label;
			}
		}

		return $best_label;
	}

	/**
	 * Setup neighbors so that [neighbors[i][0], neighbors[i][1]) is the range
	 * within edges that contains all node i's edges.
	 *
	 * @param array $edges Ordered edge list (sorted by first index). Only read.
	 * @param array $neighbors Output. Always rebuilt from scratch with one
	 *                         [start, end) pair per node; nodes without edges
	 *                         get the empty range [0, 0).
	 */
	public static function find_neighbor_ranges(array &$edges, &$neighbors): void
	{
		$num_nodes = self::max_index_plus_one($edges);
		$num_edges = count($edges);

		// Discard whatever the caller passed in: stale entries would otherwise
		// be counted as extra nodes.
		$neighbors = [];
		for ($node = 0; $node < $num_nodes; ++$node) {
			$neighbors[$node] = [0, 0];
		}

		$cur_node = 0;
		$start_idx = 0;
		for ($i = 0; $i < $num_edges; ++$i) {
			// Loose comparison kept on purpose so that numerically equal
			// indices of different scalar types are still the same node.
			if ($edges[$i][0] != $cur_node) {
				$neighbors[$cur_node] = [$start_idx, $i];
				$start_idx = $i;
				$cur_node = $edges[$i][0];
			}
		}
		if ($num_nodes !== 0) {
			$neighbors[$cur_node] = [$start_idx, $num_edges];
		}
	}

	/**
	 * Return the number of nodes referenced by a list of index pairs, that is
	 * the largest index found plus one, or 0 for an empty list.
	 *
	 * @param array $pairs List of [index, index] pairs.
	 */
	public static function max_index_plus_one(array $pairs): int
	{
		$num_pairs = count($pairs);
		if ($num_pairs === 0) {
			return 0;
		}

		$max_idx = 0;
		for ($i = 0; $i < $num_pairs; ++$i) {
			if ($pairs[$i][0] > $max_idx) {
				$max_idx = $pairs[$i][0];
			}
			if ($pairs[$i][1] > $max_idx) {
				$max_idx = $pairs[$i][1];
			}
		}
		return $max_idx + 1;
	}

	/**
	 * Expand undirected links into directed ones: every [a, b] yields [a, b]
	 * and [b, a], a self link [a, a] is emitted only once. The result is NOT
	 * sorted; the caller has to sort it before handing it to predict().
	 *
	 * @param array $edges List of undirected [a, b] pairs. Only read.
	 * @param array $out_edges Output. Replaced by the list of directed pairs.
	 */
	public static function convert_unordered_to_ordered(array &$edges, &$out_edges): void
	{
		$out_edges = [];
		$num_edges = count($edges);
		for ($i = 0; $i < $num_edges; ++$i) {
			$out_edges[] = [$edges[$i][0], $edges[$i][1]];
			if ($edges[$i][0] != $edges[$i][1]) {
				$out_edges[] = [$edges[$i][1], $edges[$i][0]];
			}
		}
	}
}
4 changes: 0 additions & 4 deletions lib/Model/ExternalModel/ExternalModel.php
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,6 @@ public function isInstalled(): bool {
}

public function meetDependencies(string &$error_message): bool {
if (!extension_loaded('pdlib')) {
$error_message = "The PDlib PHP extension is not loaded.";
return false;
}
if (is_null($this->settingsService->getExternalModelUrl())) {
$error_message = "You still need to configure the URL of the service running the model.";
return false;
Expand Down

0 comments on commit 5574532

Please sign in to comment.