Merge pull request #36 from microsoft/gegeo/image_retrieval_metrics
Recall@k, Precision@k, Mean Average Precision@k, Precision-Recall Curve for image retrieval
gegeo0 authored Oct 28, 2022
2 parents 2ca15e3 + 8259247 commit 958e4a9
Showing 6 changed files with 338 additions and 23 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -42,6 +42,11 @@ This repo currently offers evaluation metrics for three vision tasks:
- `L1ErrorEvaluator`: computes the L1 error.
- **Image regression**:
- `MeanLpErrorEvaluator`: computes the mean Lp error (e.g. L1 error for p=1, L2 error for p=2, etc.).
- **Image retrieval**:
- `RecallAtKEvaluator(k)`: computes Recall@k, the fraction of all relevant items that are retrieved in the top-k results.
- `PrecisionAtKEvaluator(k)`: computes Precision@k, the fraction of the top-k retrieved items that are relevant (true positives among the items predicted positive within the top k).
- `MeanAveragePrecisionAtK(k)`: computes [Mean Average Precision@k](https://stackoverflow.com/questions/54966320/mapk-computation), an information retrieval metric.
- `PrecisionRecallCurveNPointsEvaluator(n_points)`: computes a Precision-Recall Curve, interpolated at n_points recall thresholds and averaged over all samples. A brief usage sketch follows this list.
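A minimal usage sketch of the new evaluators, following the `add_predictions` / `get_report` pattern exercised by the tests added in this commit (the expected values below match the first test case in `test/test_retrieval_evaluators.py`):

```python
import numpy as np
from vision_evaluation.retrieval_evaluators import PrecisionAtKEvaluator, RecallAtKEvaluator

# One query over five items; a higher score means the item is ranked earlier.
# Items 0, 1 and 4 are relevant.
predictions = np.array([[5, 4, 3, 2, 1]])
targets = np.array([[1, 1, 0, 0, 1]])

recall_at_3 = RecallAtKEvaluator(3)
recall_at_3.add_predictions(predictions, targets)
print(recall_at_3.get_report()["recall_at_3"])        # 2 of the 3 relevant items appear in the top-3 -> ~0.6667

precision_at_3 = PrecisionAtKEvaluator(3)
precision_at_3.add_predictions(predictions, targets)
print(precision_at_3.get_report()["precision_at_3"])  # 2 of the top-3 items are relevant -> ~0.6667
```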

While different machine learning problems/applications prefer different metrics, below are some general recommendations:

@@ -51,6 +56,7 @@ While different machine learning problems/applications prefer different metrics,
- **Image caption**: Bleu, METEOR, ROUGE-L, CIDEr, SPICE
- **Image matting**: Mean IOU, Foreground IOU, Boundary mean IOU, Boundary Foreground IOU, L1 Error
- **Image regression**: Mean L1 Error, Mean L2 Error
- **Image retrieval**: Recall@k, Precision@k, Mean Average Precision@k, Precision-Recall Curve

## Additional Requirements

7 changes: 4 additions & 3 deletions test/test_evaluators.py
@@ -5,9 +5,10 @@
from PIL import Image

from vision_evaluation.evaluators import AveragePrecisionEvaluator, F1ScoreEvaluator, TopKAccuracyEvaluator, ThresholdAccuracyEvaluator, MeanAveragePrecisionEvaluatorForSingleIOU, EceLossEvaluator, \
PrecisionEvaluator, RecallEvaluator, TagWiseAccuracyEvaluator, TagWiseAveragePrecisionEvaluator, MeanAveragePrecisionNPointsEvaluator, BalancedAccuracyScoreEvaluator, \
CocoMeanAveragePrecisionEvaluator, BleuScoreEvaluator, METEORScoreEvaluator, ROUGELScoreEvaluator, CIDErScoreEvaluator, SPICEScoreEvaluator, RocAucEvaluator, MeanIOUEvaluator, \
ForegroundIOUEvaluator, BoundaryMeanIOUEvaluator, BoundaryForegroundIOUEvaluator, L1ErrorEvaluator, GroupWiseEvaluator, MeanLpErrorEvaluator
PrecisionEvaluator, RecallEvaluator, TagWiseAccuracyEvaluator, TagWiseAveragePrecisionEvaluator, \
MeanAveragePrecisionNPointsEvaluator, BalancedAccuracyScoreEvaluator, CocoMeanAveragePrecisionEvaluator, BleuScoreEvaluator, METEORScoreEvaluator, \
ROUGELScoreEvaluator, CIDErScoreEvaluator, SPICEScoreEvaluator, RocAucEvaluator, MeanIOUEvaluator, ForegroundIOUEvaluator, BoundaryMeanIOUEvaluator, BoundaryForegroundIOUEvaluator, \
L1ErrorEvaluator, GroupWiseEvaluator, MeanLpErrorEvaluator
from vision_evaluation.prediction_filters import TopKPredictionFilter, ThresholdPredictionFilter


133 changes: 133 additions & 0 deletions test/test_retrieval_evaluators.py
@@ -0,0 +1,133 @@
import unittest
import numpy as np

from vision_evaluation.retrieval_evaluators import MeanAveragePrecisionAtK, PrecisionAtKEvaluator, PrecisionRecallCurveNPointsEvaluator, RecallAtKEvaluator


class TestInformationRetrievalMetrics(unittest.TestCase):
    PREDICTIONS = [np.array([[5, 4, 3, 2, 1]]),
                   np.array([[5, 4, 3, 2, 1]]),
                   np.array([[1, 2, 3, 4, 5]]),
                   np.array([[5, 4, 3, 2, 1]]),
                   np.array([[5, 4, 3, 2, 1],
                             [5, 4, 3, 2, 1]]),
                   np.array([[5, 4, 3, 2, 1],
                             [5, 4, 3, 2, 1]]),
                   np.array([[1]]),
                   np.array([[2],
                             [3]])]
    TARGETS = [np.array([[1, 1, 0, 0, 1]]),
               np.array([[1, 1, 0, 0, 1]]),
               np.array([[1, 0, 0, 1, 1]]),
               np.array([[0, 0, 0, 0, 1]]),
               np.array([[0, 0, 0, 0, 1],
                         [0, 0, 0, 0, 1]]),
               np.array([[1, 0, 0, 0, 1],
                         [0, 0, 0, 0, 1]]),
               np.array([[1]]),
               np.array([[1],
                         [0]])]

    def test_recall_at_k(self):
        ks = [6, 8, 6, 6, 6, 6, 4, 4]
        expectations = [[0, 0.33333, 0.66666, 0.66666, 0.66666, 1.0],
                        [0, 0.33333, 0.66666, 0.66666, 0.66666, 1.0, 1.0, 1.0],
                        [0, 0.33333, 0.66666, 0.66666, 0.66666, 1.0],
                        [0, 0, 0, 0, 0, 1.0],
                        [0, 0, 0, 0, 0, 1.0],
                        [0, 0.25, 0.25, 0.25, 0.25, 1.0],
                        [0, 1.0, 1.0, 1.0],
                        [0, 0.5, 0.5, 0.5]]
        assert len(self.PREDICTIONS) == len(self.TARGETS) == len(ks) == len(expectations)
        for preds, tgts, exps, k in zip(self.PREDICTIONS, self.TARGETS, expectations, ks):
            for i in range(k):
                recall_eval = RecallAtKEvaluator(i)
                recall_eval.add_predictions(preds, tgts)
                self.assertAlmostEqual(recall_eval.get_report()[f"recall_at_{i}"], exps[i], places=4)

    def test_precision_at_k(self):
        ks = [6, 8, 6, 6, 6, 6, 4, 4]
        expectations = [[0, 1.0, 1.0, 0.66666, 0.5, 0.6],
                        [0, 1.0, 1.0, 0.66666, 0.5, 0.6, 0.6, 0.6],
                        [0, 1.0, 1.0, 0.66666, 0.5, 0.6],
                        [0, 0, 0, 0, 0, 0.2],
                        [0, 0, 0, 0, 0, 0.2],
                        [0, 0.5, 0.25, 0.16666, 0.125, 0.3],
                        [0, 1.0, 1.0, 1.0],
                        [0, 0.5, 0.5, 0.5]]
        assert len(self.PREDICTIONS) == len(self.TARGETS) == len(ks) == len(expectations)
        for preds, tgts, exps, k in zip(self.PREDICTIONS, self.TARGETS, expectations, ks):
            for i in range(k):
                precision_eval = PrecisionAtKEvaluator(i)
                precision_eval.add_predictions(preds, tgts)
                self.assertAlmostEqual(precision_eval.get_report()[f"precision_at_{i}"], exps[i], places=4)

    def test_precision_recall_curve(self):
        predictions = [np.array([[5, 4, 3, 2, 1]]),
                       np.array([[1, 3, 2, 5, 4]]),
                       np.array([[5, 4, 3, 2, 1]]),
                       np.array([[5, 4, 3, 2, 1],
                                 [5, 4, 3, 2, 1]])]
        targets = [np.array([[0, 0, 0, 0, 1]]),
                   np.array([[1, 0, 0, 0, 0]]),
                   np.array([[1, 0, 0, 0, 1]]),
                   np.array([[0, 0, 0, 0, 1],
                             [1, 0, 0, 0, 1]])]

        expectations_recall = [np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.39999, 0.29999, 0.19999, 0.09999, 0.0]),
                               np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.39999, 0.29999, 0.19999, 0.09999, 0.0]),
                               np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.39999, 0.29999, 0.19999, 0.09999, 0.0]),
                               np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.39999, 0.29999, 0.19999, 0.09999, 0.0])]

        expectations_precision = [np.array([0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.]),
                                  np.array([0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 1.]),
                                  np.array([0.4, 0.4, 0.4, 0.4, 0.4, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
                                  np.array([0.3, 0.3, 0.3, 0.3, 0.3, 0.6, 0.6, 0.6, 0.6, 0.6, 1.])]

        assert len(predictions) == len(targets) == len(expectations_recall) == len(expectations_precision)
        for preds, tgts, exp_r, exp_p in zip(predictions, targets, expectations_recall, expectations_precision):
            n_points = 11
            evaluator = PrecisionRecallCurveNPointsEvaluator(n_points)
            evaluator.add_predictions(predictions=preds, targets=tgts)
            result = evaluator.get_report()
            result_recall = result[f"PR_Curve_{n_points}_point_interp"]['recall']
            result_precision = result[f"PR_Curve_{n_points}_point_interp"]['precision']
            self.assertAlmostEqual(np.sum(np.abs(result_recall - exp_r)), 0.0, places=4)
            self.assertAlmostEqual(np.sum(np.abs(result_precision - exp_p)), 0.0, places=4)

    def test_mean_average_precision_at_k_evaluator(self):
        targets = [np.array([[1, 0, 1, 1],
                             [1, 0, 0, 1]]),
                   np.array([[1, 0, 1, 1]]),
                   np.array([[1, 0, 0, 1]]),
                   np.array([[]]),
                   np.array([[1, 0, 1, 1]]),
                   np.array([[1, 0, 1, 1]]),
                   np.array([[1, 0, 1, 1]]),
                   np.array([[1, 0, 1, 1]]),
                   np.array([[1, 0, 1, 1]]),
                   np.array([[1, 0, 1, 1]]),
                   np.array([[0, 0, 0, 0]]),
                   np.array([[1, 0, 1]])]
        predictions = [np.array([[5, 4, 3, 2],
                                 [5, 4, 3, 2]]),
                       np.array([[5, 4, 3, 2]]),
                       np.array([[5, 4, 3, 2]]),
                       np.array([[]]),
                       np.array([[2, 3, 5, 4]]),
                       np.array([[4, 2, 3, 5]]),
                       np.array([[4, 2, 3, 5]]),
                       np.array([[2, 3, 5, 4]]),
                       np.array([[2, 3, 5, 4]]),
                       np.array([[2, 3, 5, 4]]),
                       np.array([[2, 3, 5, 4]]),
                       np.array([[2, 3, 5]])]
        rank = [4, 4, 4, 4, 4, 4, 3, 3, 5, 2, 4, 4]
        expectations = [0.77777, 0.80555, 0.75, 0.0, 0.91666, 1.0, 1.0, 0.66666, 0.91666, 1.0, 0.0, 0.83333]

        assert len(targets) == len(predictions) == len(rank) == len(expectations)

        for preds, tgts, exps, k in zip(predictions, targets, expectations, rank):
            evaluator = MeanAveragePrecisionAtK(k)
            evaluator.add_predictions(preds, tgts)
            self.assertAlmostEqual(evaluator.get_report()[f"map_at_{k}"], exps, places=4)
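The expectations above can be reproduced by hand. For the second case (`targets=[[1, 0, 1, 1]]`, `predictions=[[5, 4, 3, 2]]`, `k=4`) the relevant items sit at ranks 1, 3 and 4, giving precisions 1/1, 2/3 and 3/4, so AP@4 = (1 + 0.6667 + 0.75) / 3 ≈ 0.80556. The snippet below is a hand-rolled cross-check, assuming the common convention of dividing by min(k, number of relevant items); it is not the library implementation.

```python
import numpy as np


def average_precision_at_k(scores, relevance, k):
    """Hand-rolled AP@k used only to sanity-check the expected values above."""
    order = np.argsort(-scores)[:k]                        # indices of the top-k items, best first
    hits = relevance[order]                                # relevance of the top-k, in rank order
    precisions = np.cumsum(hits) / (np.arange(len(hits)) + 1)
    n_relevant = int(relevance.sum())
    if n_relevant == 0:
        return 0.0
    return float(np.sum(precisions * hits) / min(k, n_relevant))


print(average_precision_at_k(np.array([5, 4, 3, 2]), np.array([1, 0, 1, 1]), 4))  # ~0.80556
```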
12 changes: 8 additions & 4 deletions vision_evaluation/__init__.py
@@ -1,12 +1,16 @@
from .evaluators import MeanAveragePrecisionEvaluatorForMultipleIOUs, MeanAveragePrecisionEvaluatorForSingleIOU, TopKAccuracyEvaluator, ThresholdAccuracyEvaluator, PrecisionEvaluator, \
RecallEvaluator, AveragePrecisionEvaluator, EceLossEvaluator, F1ScoreEvaluator, RocAucEvaluator, Evaluator, MemorizingEverythingEvaluator, EvaluatorAggregator, TagWiseAveragePrecisionEvaluator, \
TagWiseAccuracyEvaluator, MeanAveragePrecisionNPointsEvaluator, BalancedAccuracyScoreEvaluator, CocoMeanAveragePrecisionEvaluator, BleuScoreEvaluator, METEORScoreEvaluator, \
RecallEvaluator, AveragePrecisionEvaluator, EceLossEvaluator, F1ScoreEvaluator, RocAucEvaluator, Evaluator, MemorizingEverythingEvaluator, EvaluatorAggregator, \
TagWiseAveragePrecisionEvaluator, TagWiseAccuracyEvaluator, MeanAveragePrecisionNPointsEvaluator, \
BalancedAccuracyScoreEvaluator, CocoMeanAveragePrecisionEvaluator, BleuScoreEvaluator, METEORScoreEvaluator, \
ROUGELScoreEvaluator, CIDErScoreEvaluator, SPICEScoreEvaluator, MeanIOUEvaluator, ForegroundIOUEvaluator, BoundaryMeanIOUEvaluator, BoundaryForegroundIOUEvaluator, L1ErrorEvaluator, \
GroupWiseEvaluator

from .retrieval_evaluators import MeanAveragePrecisionAtK, PrecisionAtKEvaluator, PrecisionRecallCurveNPointsEvaluator, RecallAtKEvaluator

__all__ = ['MeanAveragePrecisionEvaluatorForMultipleIOUs', 'MeanAveragePrecisionEvaluatorForSingleIOU', 'TopKAccuracyEvaluator', 'ThresholdAccuracyEvaluator', 'PrecisionEvaluator', 'RecallEvaluator',
__all__ = ['MeanAveragePrecisionEvaluatorForMultipleIOUs', 'MeanAveragePrecisionEvaluatorForSingleIOU', 'TopKAccuracyEvaluator',
'ThresholdAccuracyEvaluator', 'PrecisionEvaluator', 'PrecisionAtKEvaluator', 'MeanAveragePrecisionAtK', 'RecallEvaluator', 'RecallAtKEvaluator',
"AveragePrecisionEvaluator", "EceLossEvaluator", 'F1ScoreEvaluator', 'RocAucEvaluator', 'Evaluator', 'MemorizingEverythingEvaluator', 'EvaluatorAggregator', 'TagWiseAccuracyEvaluator',
'TagWiseAveragePrecisionEvaluator', 'MeanAveragePrecisionNPointsEvaluator', 'BalancedAccuracyScoreEvaluator', 'CocoMeanAveragePrecisionEvaluator', 'BleuScoreEvaluator',
'TagWiseAveragePrecisionEvaluator', 'MeanAveragePrecisionNPointsEvaluator',
'PrecisionRecallCurveNPointsEvaluator', 'BalancedAccuracyScoreEvaluator', 'CocoMeanAveragePrecisionEvaluator', 'BleuScoreEvaluator',
'METEORScoreEvaluator', 'ROUGELScoreEvaluator', 'CIDErScoreEvaluator', 'SPICEScoreEvaluator', 'MeanIOUEvaluator', 'ForegroundIOUEvaluator', 'BoundaryForegroundIOUEvaluator',
'BoundaryMeanIOUEvaluator', 'L1ErrorEvaluator', 'GroupWiseEvaluator']
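With the re-exports above, the retrieval evaluators are importable from the package root as well as from `vision_evaluation.retrieval_evaluators`. A small sketch (the expected value comes from the tests added in this commit):

```python
import numpy as np
from vision_evaluation import MeanAveragePrecisionAtK

evaluator = MeanAveragePrecisionAtK(4)
evaluator.add_predictions(np.array([[5, 4, 3, 2]]), np.array([[1, 0, 1, 1]]))
print(evaluator.get_report()["map_at_4"])  # ~0.80555
```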
44 changes: 28 additions & 16 deletions vision_evaluation/evaluators.py
@@ -109,7 +109,7 @@ def add_predictions(self, predictions, targets):
else:
self.all_targets = targets.copy()

def calculate_score(self, average='macro'):
def calculate_score(self, average='macro', filter_out_zero_tgt=True):
"""
average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
If ``None``, the scores for each class are returned. Otherwise,
@@ -126,6 +126,8 @@ def calculate_score(self, average='macro'):
by support (the number of true instances for each label).
``'samples'``:
Calculate metrics for each instance, and find their average.
filter_out_zero_tgt : bool
If True, remove target columns that are all zero. For precision calculations this must be set to False, otherwise false positives in those columns would be discarded.
"""
if self.all_predictions.size == 0:
return 0.0
@@ -134,7 +136,7 @@ def calculate_score(self, average='macro'):
assert tar_mat.size == self.all_predictions.size
result = 0.0
if tar_mat.size > 0:
non_empty_idx = np.where(np.invert(np.all(tar_mat == 0, axis=0)))[0]
non_empty_idx = np.where(np.invert(np.all(tar_mat == 0, axis=0)))[0] if filter_out_zero_tgt else np.arange(tar_mat.shape[1])
if non_empty_idx.size != 0:
result = self._calculate(tar_mat[:, non_empty_idx], self.all_predictions[:, non_empty_idx], average=average)

@@ -266,6 +268,10 @@ def _get_id(self):
def _calculate(self, targets, predictions, average):
return sm.precision_score(targets, predictions, average=average)

def get_report(self, **kwargs):
average = kwargs.get('average', 'macro')
return {self._get_id(): self.calculate_score(average=average, filter_out_zero_tgt=False)}
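A toy illustration of why `filter_out_zero_tgt=False` matters for precision. The data below is made up for illustration and assumes multi-label indicator targets with predictions already thresholded to {0, 1}:

```python
import numpy as np
import sklearn.metrics as sm

# Two samples, two classes. Class 1 has no positive targets, but the model
# predicts it once -- a pure false positive.
targets = np.array([[1, 0],
                    [1, 0]])
predictions = np.array([[1, 1],
                        [1, 0]])

# Keeping the all-zero target column counts the false positive:
# macro precision = (1.0 + 0.0) / 2 = 0.5.
print(sm.precision_score(targets, predictions, average='macro', zero_division=0))

# Filtering that column out (the previous behaviour) would score only class 0
# and report 1.0, silently hiding the false positive.
print(sm.precision_score(targets[:, [0]], predictions[:, [0]], average='macro', zero_division=0))
```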


class RecallEvaluator(MemorizingEverythingEvaluator):
"""
@@ -295,7 +301,7 @@ def _get_id(self):

def calculate_score(self, average='macro'):
if average != 'macro':
return super().calculate_score(average)
return super().calculate_score(average=average)

ap = 0.0
if self.all_targets.size == 0:
@@ -751,40 +757,46 @@ def _get_id(self):
return 'balanced_accuracy'


class MeanAveragePrecisionNPointsEvaluator(MemorizingEverythingEvaluator):
class PrecisionRecallCurveMixin():
"""
N-point interpolated average precision, averaged over classes
N-point interpolated precision-recall curve, averaged over samples
"""

def __init__(self, n_points=11):
super().__init__()
self.ap_n_points_eval = []
self.n_points = n_points

def _calculate(self, targets, predictions, average):
n_class = predictions.shape[1]
return np.mean([self._per_class_calc(predictions[:, i], targets[:, i]) for i in range(n_class)])

def _per_class_calc(self, predictions, targets):
def _calc_precision_recall_interp(self, predictions, targets, recall_thresholds):
""" Evaluate a batch of predictions.
Args:
predictions: the probability of the data to be 'positive'. Shape (N,)
predictions: the probability or score of the data to be 'positive'. Shape (N,)
targets: the binary ground truths in {0, 1} or {-1, 1}. Shape (N,)
"""
assert len(predictions) == len(targets)
assert len(targets.shape) == 1

precision, recall, _ = sm.precision_recall_curve(targets, predictions)
recall_thresholds = np.linspace(1, 0, self.n_points, endpoint=True).tolist()
precision_sum = 0
precision_interp = np.empty(len(recall_thresholds))
recall_idx = 0
precision_tmp = 0
for threshold in recall_thresholds:
for idx, threshold in enumerate(recall_thresholds):
while recall_idx < len(recall) and threshold <= recall[recall_idx]:
precision_tmp = max(precision_tmp, precision[recall_idx])
recall_idx += 1
precision_sum += precision_tmp
return precision_sum / self.n_points
precision_interp[idx] = precision_tmp
return precision_interp


class MeanAveragePrecisionNPointsEvaluator(PrecisionRecallCurveMixin, MemorizingEverythingEvaluator):
"""
N-point interpolated average precision, averaged over classes
"""

def _calculate(self, targets, predictions, average):
n_class = predictions.shape[1]
recall_thresholds = np.linspace(1, 0, self.n_points, endpoint=True).tolist()
return np.mean([np.mean(self._calc_precision_recall_interp(predictions[:, i], targets[:, i], recall_thresholds)) for i in range(n_class)])

def _get_id(self):
return f'mAP_{self.n_points}_points'
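The running max in `_calc_precision_recall_interp` amounts to taking, at each recall threshold, the best precision achievable at that recall or higher. A sketch of that equivalent formulation, run on the third case from the new `test_precision_recall_curve` test (two relevant items, ranked first and last among five):

```python
import numpy as np
import sklearn.metrics as sm

targets = np.array([1, 0, 0, 0, 1])
scores = np.array([5, 4, 3, 2, 1])

precision, recall, _ = sm.precision_recall_curve(targets, scores)
recall_thresholds = np.linspace(1, 0, 11, endpoint=True)

# Interpolated precision at threshold t = max precision over points with recall >= t.
interp = np.array([precision[recall >= t].max() for t in recall_thresholds])
print(interp)         # [0.4 x 5, 1.0 x 6], matching the expectation in the test above
print(interp.mean())  # the 11-point average precision for this single "class"
```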
