Skip to content

Commit

Permalink
Add Sampler Plugin (cvat-ai#115)
Browse files Browse the repository at this point in the history
* Added sampler 

* update CHANGELOG.md

* update documentation

* Adding pandas update requirements.txt
  • Loading branch information
Harim Kang authored Mar 2, 2021
1 parent 1a13031 commit 3bbf056
Show file tree
Hide file tree
Showing 11 changed files with 1,611 additions and 0 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ matrix:
install:
- pip install -e ./
- pip install tensorflow
- pip install pandas

script:
- python -m unittest discover -v
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `LFW` dataset format (<https://github.com/openvinotoolkit/datumaro/pull/110>)
- Support of polygons' and masks' confusion matrices and mismatching classes in `diff` command (<https://github.com/openvinotoolkit/datumaro/pull/117>)
- Add near duplicate image removal plugin (<https://github.com/openvinotoolkit/datumaro/pull/113>)
- Sampler Plugin that analyzes inference result from the given dataset and selects samples for annotation (<https://github.com/openvinotoolkit/datumaro/pull/115>)

### Changed
- OpenVINO model launcher is updated for OpenVINO r2021.1 (<https://github.com/openvinotoolkit/datumaro/pull/100>)
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ CVAT annotations ---> Publication, statistics etc.
- for detection task, based on bboxes
- for re-identification task, based on labels,
avoiding having same IDs in training and test splits
- Sampling a dataset
- analyzes inference result from the given dataset
and selects the ‘best’ and the ‘least amount of’ samples for annotation.
- Select the sample that best suits model training.
- sampling with Entropy based algorithm
- Dataset quality checking
- Simple checking for errors
- Comparison with model inference
Expand Down
3 changes: 3 additions & 0 deletions datumaro/plugins/sampler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
22 changes: 22 additions & 0 deletions datumaro/plugins/sampler/algorithm/algorithm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

from enum import Enum

class SamplingMethod(Enum):
    """Names of the strategies a sampler can use to pick k items."""

    topk = 1
    lowk = 2
    randk = 3
    mixk = 4
    randtopk = 5


class Algorithm(Enum):
    """Names of the scoring algorithms available to the sampler."""

    entropy = 1


class InferenceResultAnalyzer:
    """
    Base interface for an IRA (Inference Result Analyzer).

    Holds a dataset together with its inference results; concrete
    analyzers override `get_sample` to pick items out of them.
    """

    def __init__(self, dataset, inference):
        # Expose the SamplingMethod enum so subclasses can compare a
        # requested method name against `self.sampling_method.<member>.name`.
        self.sampling_method = SamplingMethod
        self.inference = inference
        self.data = dataset

    def get_sample(self, method: str, k: int):
        """
        Return `k` samples selected with the given `method`.

        Not implemented here; always raises NotImplementedError.
        """
        raise NotImplementedError
191 changes: 191 additions & 0 deletions datumaro/plugins/sampler/algorithm/entropy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# Copyright (C) 2021 Intel Corporation
#
# SPDX-License-Identifier: MIT

import pandas as pd
import math
import re
import logging as log

from .algorithm import InferenceResultAnalyzer


class SampleEntropy(InferenceResultAnalyzer):
    """
    Entropy-based inference result analyzer.

    Calculates an uncertainty score (entropy of the class probabilities)
    for every image in the inference results, ranks the images by that
    score and returns samples of the data chosen according to the rank.
    """

    def __init__(self, data, inference):
        """
        Validate the inputs and build the uncertainty ranking.

        Args:
            data: The data in pd.DataFrame format. It must contain
                an "ImageID" column.
            inference: The inference results in pd.DataFrame format.
                It must contain an "ImageID" column and at least one
                "ClassProbability<N>" column.

        Raises:
            Exception: if "ImageID" is missing from `data` or `inference`,
                or if `inference` has no ClassProbability columns.
        """
        super().__init__(data, inference)

        # check the existence of "ImageID" in data & inference
        if "ImageID" not in data:
            raise Exception("Invalid Data, ImageID not found in data")
        if "ImageID" not in inference:
            raise Exception("Invalid Data, ImageID not found in inference")

        # Count the "ClassProbability<N>" columns. The pattern is a raw
        # string so that "\d" is a regex digit class and not an invalid
        # string escape sequence.
        self.num_classes = sum(
            1
            for head in inference
            if re.match(r"ClassProbability\d+", head) is not None
        )

        if not self.num_classes > 0:
            raise Exception(
                "Invalid data, Inference do not have ClassProbability values!"
            )

        # rank: one row per ImageID, sorted so the most uncertain is first.
        self.rank = self._rank_images().sort_values(by="rank")

    def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
        """
        Extract sample data and return it.

        Args:
            method:
                - 'topk': the k samples with the highest uncertainty.
                - 'lowk': the k samples with the lowest uncertainty.
                - 'randk': k samples drawn at random from the data.
                - 'mixk': half from the top and half from the bottom
                  of the ranking (the top half gets the extra one if k is odd).
                - 'randtopk': randomly pick n*k ranked items, then take
                  the k most uncertain of them.
            k: number of samples; must be a positive integer.
            n: multiplier used only by the 'randtopk' method.

        Returns:
            The extracted rows of the data (same columns as the data),
            with a fresh 0..N-1 index. If k exceeds the number of ranked
            images, a warning is logged and all ranked data is returned.

        Raises:
            Exception: if k is not a positive integer, or if `method`
                is unknown (checked only when k fits the ranking size).
        """
        temp_rank = self.rank
        methods = self.sampling_method

        # 1. k value check
        if not isinstance(k, int):
            raise Exception(
                f"Invalid value {k}. k must have an integer greater than zero."
            )
        elif k <= 0:
            raise Exception(
                f"Invalid number {k}. k must have a positive number greater than zero."
            )

        # 2. Select a sample according to the method
        if k <= len(temp_rank):
            if method == methods.topk.name:
                temp_rank = temp_rank.iloc[:k]
            elif method == methods.lowk.name:
                temp_rank = temp_rank.iloc[-k:]
            elif method == methods.randk.name:
                # Random sampling ignores the ranking entirely.
                return self.data.sample(n=k).reset_index(drop=True)
            elif method in (methods.mixk.name, methods.randtopk.name):
                return self._get_sample_mixed(method=method, k=k, n=n)
            else:
                raise Exception(f"Not Found method '{method}'")
        else:
            log.warning(
                "The number of samples is greater than the size of the selected subset."
            )

        return self._select_from_data(temp_rank)

    def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame:
        """
        Extract sample data for the composite sampling methods.

        Args:
            method:
                - 'mixk': return the top and the bottom halves of k
                  based on uncertainty.
                - 'randtopk': randomly extract n*k ranked items and
                  return the k with the highest uncertainty among them.
            k: number of samples.
            n: multiplier; n*k items are drawn first for 'randtopk'.

        Returns:
            The extracted rows of the data, with a fresh 0..N-1 index.
        """
        temp_rank = self.rank
        total = len(temp_rank)

        # Select a sample according to the method
        if k <= total:
            if method == self.sampling_method.mixk.name:
                low_count = k // 2
                top_count = k - low_count  # gets the extra item when k is odd
                # The bottom slice is taken with an explicit start position:
                # a negative slice like [-low_count:] would select the WHOLE
                # frame when low_count == 0 (i.e. k == 1).
                temp_rank = pd.concat(
                    [
                        temp_rank.iloc[:top_count],
                        temp_rank.iloc[total - low_count :],
                    ]
                )
            elif method == self.sampling_method.randtopk.name:
                if n * k <= total:
                    temp_rank = temp_rank.sample(n=n * k).sort_values(by="rank")
                else:
                    log.warning(msg="n * k exceeds the length of the inference")
                temp_rank = temp_rank.iloc[:k]

        return self._select_from_data(temp_rank)

    def _select_from_data(self, rank: pd.DataFrame) -> pd.DataFrame:
        """
        Return the rows of the data whose ImageID appears in `rank`,
        in rank order, restricted to the data's own columns.
        """
        columns = list(self.data.columns)
        merged_df = pd.merge(rank, self.data, how="inner", on=["ImageID"])
        return merged_df[columns].reset_index(drop=True)

    def _rank_images(self) -> pd.DataFrame:
        """
        Rank the inference data based on uncertainty.

        Returns:
            A pd.DataFrame with one row per ImageID and the columns
            "ImageID", "Uncertainty" (mean over the rows of that image)
            and "rank" (1 is the most uncertain). The rows are not sorted.

        Raises:
            Exception: if there is no inference, or if any Uncertainty
                value is missing.
        """
        # 1. Load Inference
        if self.inference is None:
            raise Exception("Invalid Data, Failed to load inference result!")
        inference = pd.DataFrame(self.inference)

        # 2. If the inference data frame does not contain an uncertainty
        #    score, calculate it
        if "Uncertainty" not in inference:
            inference = self._calculate_uncertainty_from_classprob(inference=inference)

        # 3. Check that Uncertainty values are in place.
        if inference["Uncertainty"].isna().any():
            raise Exception("Some inference results do not have Uncertainty values!")

        # 4. Rank based on the Uncertainty score; ties are broken by order
        #    of appearance (method="first").
        res = inference[["ImageID", "Uncertainty"]].groupby("ImageID").mean()
        res["rank"] = res["Uncertainty"].rank(ascending=False, method="first")
        return res.reset_index()

    def _calculate_uncertainty_from_classprob(
        self, inference: pd.DataFrame
    ) -> pd.DataFrame:
        """
        Calculate the entropy-based uncertainty from ClassProbability values.

        Args:
            inference: Inference data where uncertainty has not been
                calculated. It must have the columns "ClassProbability1"
                .. "ClassProbability<num_classes>". An "Uncertainty"
                column is added to this frame in place.

        Returns:
            The same inference data with the "Uncertainty" column.

        Raises:
            Exception: if any probability is outside the [0, 1] range.
        """
        prob_columns = [f"ClassProbability{j + 1}" for j in range(self.num_classes)]

        # Calculate Entropy (Uncertainty Score).
        # Rows are walked positionally, so the frame does not need to have
        # the default 0..N-1 index.
        uncertainty = []
        for probabilities in inference[prob_columns].itertuples(
            index=False, name=None
        ):
            entropy = 0
            for p in probabilities:
                if p < 0 or p > 1:
                    raise Exception(
                        "Invalid data, Math domain Error! p is between 0 and 1"
                    )
                # The small epsilon keeps log() defined for p == 0.
                entropy -= p * math.log(p + 1e-14, math.e)

            uncertainty.append(entropy)

        inference["Uncertainty"] = uncertainty

        return inference
Loading

0 comments on commit 3bbf056

Please sign in to comment.