forked from cvat-ai/cvat
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Added sampler * update CHANGELOG.md * update documentation * Adding pandas update requirements.txt
- Loading branch information
Harim Kang
authored
Mar 2, 2021
1 parent
1a13031
commit 3bbf056
Showing
11 changed files
with
1,611 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Copyright (C) 2021 Intel Corporation | ||
# | ||
# SPDX-License-Identifier: MIT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Copyright (C) 2021 Intel Corporation | ||
# | ||
# SPDX-License-Identifier: MIT | ||
|
||
from enum import Enum | ||
|
||
SamplingMethod = Enum("SamplingMethod", ["topk", "lowk", "randk", "mixk", "randtopk"]) | ||
Algorithm = Enum("Algorithm", ["entropy"]) | ||
|
||
|
||
class InferenceResultAnalyzer: | ||
""" | ||
Basic interface for IRA (Inference Result Analyzer) | ||
""" | ||
|
||
def __init__(self, dataset, inference): | ||
self.data = dataset | ||
self.inference = inference | ||
self.sampling_method = SamplingMethod | ||
|
||
def get_sample(self, method: str, k: int): | ||
raise NotImplementedError() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
# Copyright (C) 2021 Intel Corporation | ||
# | ||
# SPDX-License-Identifier: MIT | ||
|
||
import pandas as pd | ||
import math | ||
import re | ||
import logging as log | ||
|
||
from .algorithm import InferenceResultAnalyzer | ||
|
||
|
||
class SampleEntropy(InferenceResultAnalyzer): | ||
""" | ||
Entropy is a class that inherits an Sampler, | ||
calculates an uncertainty score based on an entropy, | ||
and get samples based on that score. | ||
""" | ||
|
||
def __init__(self, data, inference): | ||
""" | ||
Constructor function | ||
Args: | ||
data: Receive the data format in pd.DataFrame format. ImageID is an essential element for data. | ||
inference: | ||
Receive the inference format in the form of pd.DataFrame. | ||
ImageID and ClassProbability are essential for inferences. | ||
""" | ||
super().__init__(data, inference) | ||
|
||
# check the existence of "ImageID" in data & inference | ||
if "ImageID" not in data: | ||
raise Exception("Invalid Data, ImageID not found in data") | ||
if "ImageID" not in inference: | ||
raise Exception("Invalid Data, ImageID not found in inference") | ||
|
||
# check the existence of "ClassProbability" in inference | ||
self.num_classes = 0 | ||
for head in list(inference): | ||
m = re.match("ClassProbability\d+", head) | ||
if m is not None: | ||
self.num_classes += 1 | ||
|
||
if not self.num_classes > 0: | ||
raise Exception( | ||
"Invalid data, Inference do not have ClassProbability values!" | ||
) | ||
|
||
# rank: The inference DataFrame, sorted according to the score. | ||
self.rank = self._rank_images().sort_values(by="rank") | ||
|
||
def get_sample(self, method: str, k: int, n: int = 3) -> pd.DataFrame: | ||
""" | ||
A function that extracts sample data and returns it. | ||
Args: | ||
method: | ||
- 'topk': It extracts the k sample data with the highest uncertainty. | ||
- 'lowk': It extracts the k sample data with the lowest uncertainty. | ||
- 'randomk': Extract and return random k sample data. | ||
k: number of sample data | ||
n: Parameters to be used in the randtopk method, Variable to first extract data of multiple n of k. | ||
Returns: | ||
Extracted sample data : pd.DataFrame | ||
""" | ||
temp_rank = self.rank | ||
|
||
# 1. k value check | ||
if not isinstance(k, int): | ||
raise Exception( | ||
f"Invalid value {k}. k must have an integer greater than zero." | ||
) | ||
elif k <= 0: | ||
raise Exception( | ||
f"Invalid number {k}. k must have a positive number greater than zero." | ||
) | ||
|
||
# 2. Select a sample according to the method | ||
if k <= len(temp_rank): | ||
if method == self.sampling_method.topk.name: | ||
temp_rank = temp_rank[:k] | ||
elif method == self.sampling_method.lowk.name: | ||
temp_rank = temp_rank[-k:] | ||
elif method == self.sampling_method.randk.name: | ||
return self.data.sample(n=k).reset_index(drop=True) | ||
elif method in [ | ||
self.sampling_method.mixk.name, | ||
self.sampling_method.randtopk.name, | ||
]: | ||
return self._get_sample_mixed(method=method, k=k, n=n) | ||
else: | ||
raise Exception(f"Not Found method '{method}'") | ||
else: | ||
log.warning( | ||
"The number of samples is greater than the size of the selected subset." | ||
) | ||
|
||
columns = list(self.data.columns) | ||
merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"]) | ||
return merged_df[columns].reset_index(drop=True) | ||
|
||
def _get_sample_mixed(self, method: str, k: int, n: int = 3) -> pd.DataFrame: | ||
""" | ||
A function that extracts sample data and returns it. | ||
Args: | ||
method: | ||
- 'mixk': Return top-k and low-k halves based on uncertainty. | ||
- 'randomtopk': Randomly extract n*k and return k with high uncertainty. | ||
k: number of sample data | ||
n: Number to extract n*k from total data according to n, and top-k from it | ||
Returns: | ||
Extracted sample data : pd.DataFrame | ||
""" | ||
temp_rank = self.rank | ||
|
||
# Select a sample according to the method | ||
if k <= len(temp_rank): | ||
if method == self.sampling_method.mixk.name: | ||
if k % 2 == 0: | ||
temp_rank = pd.concat([temp_rank[: k // 2], temp_rank[-(k // 2) :]]) | ||
else: | ||
temp_rank = pd.concat( | ||
[temp_rank[: (k // 2) + 1], temp_rank[-(k // 2) :]] | ||
) | ||
elif method == self.sampling_method.randtopk.name: | ||
if n * k <= len(temp_rank): | ||
temp_rank = temp_rank.sample(n=n * k).sort_values(by="rank") | ||
else: | ||
log.warning(msg="n * k exceeds the length of the inference") | ||
temp_rank = temp_rank[:k] | ||
|
||
columns = list(self.data.columns) | ||
merged_df = pd.merge(temp_rank, self.data, how="inner", on=["ImageID"]) | ||
return merged_df[columns].reset_index(drop=True) | ||
|
||
def _rank_images(self) -> pd.DataFrame: | ||
""" | ||
A internal function that ranks the inference data based on uncertainty. | ||
Returns: | ||
inference data sorted by uncertainty. pd.DataFrame | ||
""" | ||
# 1. Load Inference | ||
inference, res = None, None | ||
if self.inference is not None: | ||
inference = pd.DataFrame(self.inference) | ||
else: | ||
raise Exception("Invalid Data, Failed to load inference result!") | ||
|
||
# 2. If the reference data frame does not contain an uncertify score, calculate it | ||
if "Uncertainty" not in inference: | ||
inference = self._calculate_uncertainty_from_classprob(inference=inference) | ||
|
||
# 3. Check that Uncertainty values are in place. | ||
na_df = inference.isna().sum() | ||
if "Uncertainty" in na_df and na_df["Uncertainty"] > 0: | ||
raise Exception("Some inference results do not have Uncertainty values!") | ||
|
||
# 4. Ranked based on Uncertainty score | ||
res = inference[["ImageID", "Uncertainty"]].groupby("ImageID").mean() | ||
res["rank"] = res["Uncertainty"].rank(ascending=False, method="first") | ||
res = res.reset_index() | ||
|
||
return res | ||
|
||
def _calculate_uncertainty_from_classprob( | ||
self, inference: pd.DataFrame | ||
) -> pd.DataFrame: | ||
""" | ||
A function that calculates uncertainty based on entropy through ClassProbability values. | ||
Args: | ||
inference: Inference data where uncertainty has not been calculated | ||
Returns: | ||
inference data with uncertainty variable | ||
""" | ||
|
||
# Calculate Entropy (Uncertainty Score) | ||
uncertainty = [] | ||
for i in range(len(inference)): | ||
entropy = 0 | ||
for j in range(self.num_classes): | ||
p = inference.loc[i][f"ClassProbability{j+1}"] | ||
if p < 0 or p > 1: | ||
raise Exception( | ||
"Invalid data, Math domain Error! p is between 0 and 1" | ||
) | ||
entropy -= p * math.log(p + 1e-14, math.e) | ||
|
||
uncertainty.append(entropy) | ||
|
||
inference["Uncertainty"] = uncertainty | ||
|
||
return inference |
Oops, something went wrong.