From 78887a91508d9236f1c338b5ef51818aaf381f81 Mon Sep 17 00:00:00 2001
From: UH <554c46@gmail.com>
Date: Fri, 12 Mar 2021 22:06:36 +0100
Subject: [PATCH 1/7] file renamed

---
 test/{test_ranking.py => test_rank.py} | 6 ++++++
 1 file changed, 6 insertions(+)
 rename test/{test_ranking.py => test_rank.py} (85%)

diff --git a/test/test_ranking.py b/test/test_rank.py
similarity index 85%
rename from test/test_ranking.py
rename to test/test_rank.py
index 83040f3..b36c7d3 100644
--- a/test/test_ranking.py
+++ b/test/test_rank.py
@@ -26,6 +26,12 @@ def test1():
         {"method": "pvalue", "avg": "exist", "calibration": "isotonic"},
         {"method": "pvalue", "avg": "all", "calibration": "minmax"},
         {"method": "pvalue", "avg": "exist", "calibration": "minmax"},
+        {"method": "btl", "calibration": "platt"},
+        {"method": "btl", "calibration": "isotonic"},
+        {"method": "btl", "calibration": "minmax"},
+        {"method": "orme", "calibration": "platt"},
+        {"method": "orme", "calibration": "isotonic"},
+        {"method": "orme", "calibration": "minmax"},
         {"method": "eigen", "calibration": "platt"},
         {"method": "eigen", "calibration": "isotonic"},
         {"method": "eigen", "calibration": "minmax"},

From d5d461349164379705718ad47e6fed51027915b2 Mon Sep 17 00:00:00 2001
From: UH <554c46@gmail.com>
Date: Fri, 12 Mar 2021 22:28:40 +0100
Subject: [PATCH 2/7] references added

---
 README.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/README.md b/README.md
index 91b578f..d1d9557 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,10 @@ The output has the following structure

 **Warning**: `len(examples)` must be a multiple of `(n_items - 1)`

+**References:**
+
+- Section 5 (page 4) in: Hamster, U. A. (2021, March 9). Extracting Pairwise Comparisons Data from Best-Worst Scaling Surveys by Logical Inference. [https://doi.org/10.31219/osf.io/qkxej](https://doi.org/10.31219/osf.io/qkxej)
+
 ## Counting

 **Input Data:**
@@ -125,10 +129,28 @@ agg_dok, direct_dok, direct_detail, logical_dok, logical_detail = bws.count(
     logical_dok=logical_dok, logical_detail=logical_detail, logical_database=database)
 ```

+**References:**
+
+- Sections 3-4 in: Hamster, U. A. (2021, March 9). Extracting Pairwise Comparisons Data from Best-Worst Scaling Surveys by Logical Inference. [https://doi.org/10.31219/osf.io/qkxej](https://doi.org/10.31219/osf.io/qkxej)
+
 ## Ranking

+**Input Data:**
+The input data is a Dictionary of Keys (DoK) object produced by `bwsample.count`.
+
+**Call the function:**
+The function `bwsample.rank` returns an index array with the proposed ordering (`ranked`), an ordered list of example IDs (`ordids`), scores (`scores`), and further information depending on the selected `method`.
+
+```python
+import bwsample as bws
+ranked, ordids, scores, info = bws.rank(dok, method='ratio')
+```
+
+**References:**
+
+- Eigenvector solution in: Saaty, T. L. (2003). Decision-making with the AHP: Why is the principal eigenvector necessary. European Journal of Operational Research, 145(1), 85–91. [https://doi.org/10.1016/S0377-2217(02)00227-8](https://doi.org/10.1016/S0377-2217(02)00227-8)
+- Estimating the BTL model in: Hunter, D. R. (2004). MM algorithms for generalized Bradley-Terry models. The Annals of Statistics, 32(1), 384–406. [https://doi.org/10.1214/aos/1079120141](https://doi.org/10.1214/aos/1079120141)
+- MaxDiff score in: Orme, B. (2009). MaxDiff Analysis: Simple Counting, Individual-Level Logit, and HB.
+  [https://sawtoothsoftware.com/uploads/sawtoothsoftware/originals/f89a6537-1cae-4fb5-afad-9d325c2a3143.pdf](https://sawtoothsoftware.com/uploads/sawtoothsoftware/originals/f89a6537-1cae-4fb5-afad-9d325c2a3143.pdf)
+
 ## Appendix

From 532f66d5392e54eb3915eb2f3b999aa02284f757 Mon Sep 17 00:00:00 2001
From: UH <554c46@gmail.com>
Date: Sat, 13 Mar 2021 18:11:10 +0100
Subject: [PATCH 3/7] ranking.py refactored

---
 bwsample/__init__.py     |   5 +-
 bwsample/ranking.py      | 490 +++++++++++++++------------------------
 bwsample/utils.py        |  70 +++++-
 test/test_adjustscore.py |  41 ++++
 test/test_rank.py        |  56 +++--
 5 files changed, 318 insertions(+), 344 deletions(-)
 create mode 100644 test/test_adjustscore.py

diff --git a/bwsample/__init__.py b/bwsample/__init__.py
index 648675a..794bffa 100644
--- a/bwsample/__init__.py
+++ b/bwsample/__init__.py
@@ -2,6 +2,5 @@
 from .sampling import sample
 from .counting import count
-# logical_infer
-from .utils import (to_scipy, add_dok)
-from .ranking import (rank)
+from .ranking import rank
+from .utils import (to_scipy, add_dok, adjustscore)

diff --git a/bwsample/ranking.py b/bwsample/ranking.py
index 535baf6..cef25aa 100644
--- a/bwsample/ranking.py
+++ b/bwsample/ranking.py
@@ -1,74 +1,18 @@
 from typing import List, Dict, Tuple, Optional
 from .utils import to_scipy
+from .utils import adjustscore
 import numpy as np
 import scipy.sparse
 import scipy.sparse.linalg
 import scipy.linalg
 import scipy.stats
-import sklearn.linear_model
-import sklearn.isotonic
-
-
-def minmax(arr: np.array) -> np.array:
-    data = np.array(arr)
-    xmin = data.min()
-    xmax = data.max()
-    return (data - xmin) / (xmax - xmin)
-
-
-def calibrate(scores: np.array,
-              labels: np.array,
-              method: Optional[str] = None) -> np.array:
-    """Wrapper function to calibrate scores with its binary labels
-
-    Parameters:
-    -----------
-    scores: np.array
-        The scores generated by a model. It's assumed that these scores
-        are probabilities with values between [0,1]. For example, apply
-        min-max-scaling for ratio-scale data types (i.e. score>0.0).
-
-    labels: np.array
-        The binary labels that are supposed to be classified by the scores.
-
-    method: str (Default: None)
-        The calibration algorithm:
-        - 'platt' for Platt-Scaling (Platt, 1999)
-        - 'isotonic' for Isotonic Regression (Zadrozny and Elkan, 2002)
-
-    Return:
-    -------
-    calibrated_scores : np.array
-        The predicted probabilities
-
-    References:
-    -----------
-    Platt, J., 1999. Probabilistic outputs for support vector machines and
-        comparisons to regularized likelihood methods.
-
-    Zadrozny, B., Elkan, C., 2002. Transforming classifier scores into
-        accurate multiclass probability estimates, in: Proceedings of the
-        Eighth ACM SIGKDD International Conference on Knowledge Discovery
-        and Data Mining, KDD ’02. Association for Computing Machinery,
-        New York, NY, USA, pp. 694–699.
-        https://doi.org/10.1145/775047.775151
-    """
-    scores = np.array(scores)
-    labels = np.array(labels)
-    if method == 'platt':
-        cls = sklearn.linear_model.LogisticRegression()
-        cls.fit(X=scores.reshape(-1, 1), y=labels)
-        return cls.predict_proba(scores.reshape(-1, 1))[:, 1]
-    elif method == 'isotonic':
-        cls = sklearn.isotonic.IsotonicRegression(out_of_bounds='raise')
-        cls.fit(X=scores, y=labels)
-        return cls.transform(scores)
-    else:
-        return scores


 def rank(dok: Dict[Tuple[str, str], int],
-         method: Optional[str] = 'pvalue', **kwargs):
-    """Rank and score items based on pairwise comparison frequencies
+         method: Optional[str] = 'ratio',
+         adjust: Optional[str] = None,
+         **kwargs) -> (np.array, np.array, np.array, np.array, dict):
+    """Rank items based on pairwise comparison frequencies

     Parameters:
     -----------
     dok : Dict[Tuple[str, str], int]

     method : Optional[str]
         The procedure to compute ranks and scores.
-        - 'ratios'
-        - 'btl'
-        - 'pvalue'
-        - 'orme'
-        - 'eigen'
-        - 'transition'
+        - 'ratio'
+        - 'pvalue'
+        - 'btl'
+        - 'eigen'
+        - 'trans'

     Returns:
     --------
-    ranked : List[int]
+    positions : np.array[uint64]
         The array positions to order/sort the original data by indexing.

-    ordids : List[int]
-        The item IDs in the new order.
+    sortedids : np.array[any]
+        The reordered item IDs
+
+    metrics : np.array[float]
+        The metric for each item ID. Also sorted in descending order.

-    scores : List[float]
-        The scores for each item ID. Also sorted in descending order.
+    scores : np.array[float]
+        Scaled or calibrated metrics (Default: the `metrics` if `adjust=None`)

-    info
-        Output depends on the selected method
+    info : dict
+        Further information depending on the selected `method`

     Example:
     --------
     import bwsample as bws
-    data = (
+    evaluations = (
         ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),
         ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),
         ([2, 0, 0, 1], ['A', 'B', 'C', 'D']),
         ([0, 1, 2, 0], ['A', 'B', 'C', 'D']),
         ([0, 1, 0, 2], ['A', 'B', 'C', 'D']),
     )
-    dok, _, _, _ = bws.extract_pairs_batch2(data)
-    ranked, ordids, scores, info = bws.ranking(dok, method='pvalue')
+    agg_dok, _, _, _, _ = bws.count(evaluations)
+    positions, sortedids, metrics, scores, info = bws.rank(
+        agg_dok, method='ratio', avg='exist', adjust='quantile')
     """
+    # convert to sparse matrix
     cnt, indices = to_scipy(dok)
-    if method in ('ratios',):
-        return ranking_maximize_ratios(cnt, indices, **kwargs)
+
+    # compute the rankings
+    if method in ('ratio',):
+        positions, sortedids, metrics, info = maximize_ratio(
+            cnt, indices, **kwargs)
     elif method in ('pvalue',):
-        return ranking_minus_pvalues(cnt, indices, **kwargs)
+        positions, sortedids, metrics, info = maximize_minuspvalue(
+            cnt, indices, **kwargs)
-    elif method in ('btl', 'bradley', 'hunter'):
-        return ranking_btl(cnt, indices, **kwargs)
-    elif method in ('orme',):
-        return scoring_orme(cnt, indices, **kwargs)
-    elif method in ('eigen',):
-        return scoring_eigenvector(cnt, indices, **kwargs)
-    elif method in ('transition',):
-        return transition_simulation(cnt, indices, **kwargs)
+    elif method in ('btl', 'hunter'):
+        positions, sortedids, metrics, info = bradley_terry_probability(
+            cnt, indices, **kwargs)
+    elif method in ('eigen', 'saaty'):
+        positions, sortedids, metrics, info = eigenvector_estimation(
+            cnt, indices, **kwargs)
+    elif method in ('trans',):
+        positions, sortedids, metrics, info = transition_simulation(
+            cnt, indices, **kwargs)
     else:
         raise Exception(f"method='{method}' not available.")

+    # adjust scores
+    if adjust is not None:
+        cut = np.median(metrics)
+        labels = [x >= cut for x in metrics]
+        scores = adjustscore(metrics, method=adjust, labels=labels)
+    else:
+        scores = metrics.copy()
+
+    # done
+    return positions, sortedids, metrics, scores, info


-def ranking_maximize_ratios(cnt: scipy.sparse.csr_matrix,
-                            indices: List[str],
-                            avg: Optional[str] = 'exist',
-                            calibration: Optional[str] = 'platt'):
+def maximize_ratio(cnt: scipy.sparse.csr_matrix,
+                   indices: List[str],
+                   avg: Optional[str] = 'exist'):
     """Rank items based on simple ratios, and use row sums as metrics

     Parameters:
     -----------
     cnt : scipy.sparse.dok.dok_matrix
         Quadratic sparse matrix with frequency data

     indices : List[str]
         Identifiers, e.g. UUID4, of each row/column of the `cnt` matrix.

     avg : Optional[str] = 'exist'
         How to compute denominator for averaging.
         - 'all': divide the sum of ratios by the row length
         - 'exist': divide the sum of ratios by the number of ratios
             in the row

     Returns:
     --------
-    ranked : List[int]
+    positions : np.array[uint64]
         The array positions to order/sort the original data by indexing.

-    ordids : List[int]
-        The item IDs in the new order.
+    sortedids : np.array[any]
+        The reordered item IDs

-    scores : List[float]
-        The scores for each item ID. Also sorted in descending order.
+    metrics : np.array[float]
+        The metric for each item ID. Also sorted in descending order.

-    info
-        The matrix with the ratios
+    info : dict
+        Further information depending on the selected `method`

     Example:
     --------
     import bwsample as bws
-    data = (
+    evaluations = (
         ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),
         ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),
         ([2, 0, 0, 1], ['A', 'B', 'C', 'D']),
         ([0, 1, 2, 0], ['A', 'B', 'C', 'D']),
         ([0, 1, 0, 2], ['A', 'B', 'C', 'D']),
     )
-    dok, _, _, _ = bws.extract_pairs_batch2(data)
-    ranked, ordids, scores, ratios = bws.rank(
-        dok, method='ratio', avg='exist', calibration='platt')
+    agg_dok, _, _, _, _ = bws.count(evaluations)
+    positions, sortedids, metrics, scores, info = bws.rank(
+        agg_dok, method='ratio', avg='exist')
     """
     # compute ratios
     cnt = cnt.tocsr()
     ratios = cnt + cnt.T
     ratios.data = 1.0 / ratios.data
     ratios = ratios.multiply(cnt)

     # sum rows in DoK matrix
-    metric = np.array(ratios.sum(axis=1).flatten())[0]
+    metrics = np.array(ratios.sum(axis=1).flatten())[0]

     # averaging
     if avg == 'all':
-        metric /= len(metric)
+        metrics /= len(metrics)
     elif avg == 'exist':
         ridx, _ = (ratios + ratios.T).nonzero()  # ensure actual 0s are counted
         for i, c in zip(*np.unique(ridx, return_counts=True)):
-            metric[i] /= c
+            metrics[i] /= c

     # sort, larger row sums are better
-    ranked = np.argsort(-metric)  # maximize
-    ordids = np.array(indices)[ranked].tolist()
-    scores = metric[ranked]
+    positions = np.argsort(-metrics)  # maximize
+    sortedids = np.array(indices)[positions]
+    metrics = metrics[positions]

-    # calibrate scores
-    if calibration in ('platt', 'isotonic'):
-        labels = scores > np.mean(scores)  # TRUE: s>mean(s)
-        scores = calibrate(scores, labels, method=calibration)
-    elif calibration == 'minmax':
-        scores = minmax(scores)
+    # further information
+    info = {}

     # done
-    return ranked.tolist(), ordids, scores.tolist(), ratios
+    return positions, sortedids, metrics, info


-def ranking_minus_pvalues(cnt: scipy.sparse.csr_matrix,
-                          indices: List[str],
-                          avg: Optional[str] = 'exist',
-                          calibration: Optional[str] = 'platt'):
+def maximize_minuspvalue(cnt: scipy.sparse.csr_matrix,
+                         indices: List[str],
+                         avg: Optional[str] = 'exist'):
     """Rank based on p-values of Chi-Squared tests between reciprocal
     pairs, and use row sums as metrics

     Parameters:
     -----------
     cnt : scipy.sparse.dok.dok_matrix
         Quadratic sparse matrix with frequency data

     indices : List[str]
         Identifiers, e.g. UUID4, of each row/column of the `cnt` matrix.

     avg : Optional[str]
         How to compute denominator for averaging.
         - 'all': divide the sum of ratios by the row length
         - 'exist': divide the sum of ratios by the number of ratios
             in the row

-    calibration: str (Default: None)
-        The calibrated scores. For 'platt' and 'isotonic' we assume
-        `label[i]=rowsum[i]>mean(rowsum)`.
-
     Returns:
     --------
-    ranked : List[int]
+    positions : np.array[uint64]
         The array positions to order/sort the original data by indexing.

-    ordids : List[int]
-        The item IDs in the new order.
+    sortedids : np.array[any]
+        The reordered item IDs

-    scores : List[float]
-        The scores for each item ID. Also sorted in descending order.
+    metrics : np.array[float]
+        The metric for each item ID. Also sorted in descending order.

-    info
-        The matrix with the `1-p`-values
+    info : dict
+        Further information depending on the selected `method`, e.g.
+        - "P": The matrix with the `1-p`-values

     Example:
     --------
     import bwsample as bws
-    data = (
+    evaluations = (
         ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),
         ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),
         ([2, 0, 0, 1], ['A', 'B', 'C', 'D']),
         ([0, 1, 2, 0], ['A', 'B', 'C', 'D']),
         ([0, 1, 0, 2], ['A', 'B', 'C', 'D']),
     )
-    dok, _, _, _ = bws.extract_pairs_batch2(data)
-    ranked, ordids, scores, (eigval, eigenvec) = bws.rank(
-        dok, method='pvalue', avg='exist', calibration='platt')
+    agg_dok, _, _, _, _ = bws.count(evaluations)
+    positions, sortedids, metrics, scores, info = bws.rank(
+        agg_dok, method='pvalue', avg='exist')
     """
     # compute p-values for Nij>Nji or 1
     n, _ = cnt.shape

             P[j, i] = 1 - pval

     # sum rows in DoK matrix
-    metric = np.array(P.sum(axis=1).flatten())[0]
+    metrics = np.array(P.sum(axis=1).flatten())[0]

     # averaging
     if avg == 'all':
-        metric /= len(metric)
+        metrics /= len(metrics)
     elif avg == 'exist':
         ridx, _ = (P + P.T).nonzero()  # ensure actual 0s are counted
         for i, c in zip(*np.unique(ridx, return_counts=True)):
-            metric[i] /= c
+            metrics[i] /= c

     # sort, larger row sums are better
-    ranked = np.argsort(-metric)  # minimize
-    ordids = np.array(indices)[ranked].tolist()
-    scores = metric[ranked]
+    positions = np.argsort(-metrics)  # minimize P, maximize 1-P
+    sortedids = np.array(indices)[positions]
+    metrics = metrics[positions]

-    # calibrate scores
-    if calibration in ('platt', 'isotonic'):
-        labels = scores > np.mean(scores)  # TRUE: s>mean(s)
-        scores = calibrate(scores, labels, method=calibration)
-    elif calibration == 'minmax':
-        scores = minmax(scores)
+    # further information
+    info = {}
+    info["P"] = P

     # done
-    return ranked.tolist(), ordids, scores.tolist(), P
+    return positions, sortedids, metrics, info


-def scoring_orme(cnt: scipy.sparse.csr_matrix,
-                 indices: List[str],
-                 calibration: Optional[str] = 'platt'):
-    """Scoring based on Orme (2009)
-
-    Parameters:
-    -----------
-    cnt : scipy.sparse.dok.dok_matrix
-        Quadratic sparse matrix with frequency data
-
-    indices : List[str]
-        Identifiers, e.g. UUID4, of each row/column of the `cnt` matrix.
-
-    calibration: str (Default: None)
-        The calibrated scores. For 'platt' and 'isotonic' we assume
-        `label[i]=rowsum[i]>mean(rowsum)`.
-
-    Returns:
-    --------
-    ranked : List[int]
-        The array positions to order/sort the original data by indexing.
-
-    ordids : List[int]
-        The item IDs in the new order.
-
-    scores : List[float]
-        The scores for each item ID. Also sorted in descending order.
- - info - the ratios from [-1, +1] - - Example: - -------- - import bwsample as bws - data = ( - ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), - ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), - ([2, 0, 0, 1], ['A', 'B', 'C', 'D']), - ([0, 1, 2, 0], ['A', 'B', 'C', 'D']), - ([0, 1, 0, 2], ['A', 'B', 'C', 'D']), - ) - _, dok_direct, _, _ = bws.extract_pairs_batch2(data) - ranked, ordids, scores, ratios = bws.rank( - dok_direct, method='omre', calibration='platt') - References: - ----------- - Orme, B., 2009. MaxDiff Analysis: Simple Counting, Individual-Level - Logit, and HB. https://api.semanticscholar.org/CorpusID:202605777 - """ - # compute ratios - cnt = cnt.tocsr() - metric = (cnt - cnt.T).sum(axis=1) / (cnt + cnt.T).sum(axis=1) - metric = np.array(metric.flatten())[0] - - # sort, larger row sums are better - ranked = np.argsort(-metric) # maximize - ordids = np.array(indices)[ranked].tolist() - scores = metric[ranked] - - # calibrate scores - if calibration in ('platt', 'isotonic'): - labels = scores > 0 # TRUE: s>0 - scores = calibrate(scores, labels, method=calibration) - elif calibration == 'minmax': - scores = minmax(scores) - - # done - return ranked.tolist(), ordids, scores.tolist(), metric - - -def scoring_eigenvector(cnt: scipy.sparse.csr_matrix, - indices: List[str], - calibration: Optional[str] = None): +def eigenvector_estimation(cnt: scipy.sparse.csr_matrix, + indices: List[str]): """Compute the eigenvectors of the pairwise comparison matrix, and calibrate eigenvectors as scores. @@ -389,38 +266,35 @@ def scoring_eigenvector(cnt: scipy.sparse.csr_matrix, indices : List[str] Identifiers, e.g. UUID4, of each row/column of the `cnt` matrix. - calibration: str (Default: None) - The calibrated scores. For 'platt' and 'isotonic' we assume - `label[i]=eigenvector[i]>0.5`. There is also the option to run - Min-Max-Scaling (`'minmax'`) but won't recommend using it. - Returns: -------- - ranked : List[int] + positions : np.array[uint64] The array positions to order/sort the original data by indexing. - ordids : List[int] - The item IDs in the new order. + sortedids : np.array[any] + The reordered item IDs - scores : List[float] - The scores for each item ID. Also sorted in descending order. + metrics : np.array[float] + The metric for each item ID. Also sorted in descending order. - info - (eigval, eigenvec) + info : dict + Further information depending on the selected `method`, e.g. + - "eigval": Estimated eigenvalue + - "eigenvec": Estimated eigenvector Example: -------- import bwsample as bws - data = ( + evaluations = ( ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), ([2, 0, 0, 1], ['A', 'B', 'C', 'D']), ([0, 1, 2, 0], ['A', 'B', 'C', 'D']), ([0, 1, 0, 2], ['A', 'B', 'C', 'D']), ) - dok, _, _, _ = bws.extract_pairs_batch2(data) - ranked, ordids, scores, (eigval, eigenvec) = bws.rank( - dok, method='eigen', calibration=None) + agg_dok, _, _, _, _ = bws.count(evaluations) + positions, sortedids, metrics, info = bws.rank( + agg_dok, method='eigen') References: ----------- @@ -434,7 +308,9 @@ def scoring_eigenvector(cnt: scipy.sparse.csr_matrix, for i in range(n): cnt[i, i] = 1 - # compute "positive reciprocal near consistent pairwise comparison matrix" + # Compute a sparse "positive reciprocal near consistent pairwise + # comparison matrix". Avoid accidental conversion into dense matrix + # by manipulating the value/data vector of the transposed sp matrix. 
cnt = cnt.tocsr() cntT = cnt.T cntT.data = 1.0 / cntT.data @@ -442,28 +318,25 @@ def scoring_eigenvector(cnt: scipy.sparse.csr_matrix, # compute eigenvectors as scores eigval, eigenvec = scipy.sparse.linalg.eigs(ratios, k=1) - metric = np.abs(np.real(eigenvec[:, 0])) + metrics = np.abs(np.real(eigenvec[:, 0])) # sort, larger row sums are better - ranked = np.argsort(-metric) # maximize - ordids = np.array(indices)[ranked].tolist() - scores = metric[ranked] + positions = np.argsort(-metrics) # maximize + sortedids = np.array(indices)[positions] + metrics = metrics[positions] - # calibrate scores - if calibration in ('platt', 'isotonic'): - labels = scores > .5 # TRUE: s>.5 - scores = calibrate(scores, labels, method=calibration) - elif calibration == 'minmax': - scores = minmax(scores) + # informations + info = {} + info["eigval"] = eigval + info["eigenvec"] = eigenvec # done - return ranked.tolist(), ordids, scores.tolist(), (eigval, eigenvec) + return positions, sortedids, metrics, info def transition_simulation(cnt: scipy.sparse.dok.dok_matrix, indices: List[str], - n_rounds: Optional[int] = 3, - calibration: Optional[str] = 'minmax'): + n_rounds: Optional[int] = 2): """Estimate transition matrix of item_i>item_j, simulate the item probabilities that are calibrated to scores. @@ -475,39 +348,38 @@ def transition_simulation(cnt: scipy.sparse.dok.dok_matrix, indices : List[str] Identifiers, e.g. UUID4, of each row/column of the `cnt` matrix. - calibration: Optional[str] (Default: 'platt') - The calibrated scores. We are predicting transition probabilities - here, i.e. `SUM[transprob]=1`. Thus, recommend using Min-Max-Scaling - (`'minmax'`). + n_rounds: Optional[int] = 2 + Number of steps/rounds to simulate Returns: -------- - ranked : List[int] + positions : np.array[uint64] The array positions to order/sort the original data by indexing. - ordids : List[int] - The item IDs in the new order. + sortedids : np.array[any] + The reordered item IDs - scores : List[float] - The scores for each item ID. Also sorted in descending order. + metrics : np.array[float] + The metric for each item ID. Also sorted in descending order. - info : Tuple - (x, transmat) `x` is is the predicted/simulated item probability, - and `transmat` the estimated transition probability matrix. + info : dict + Further information depending on the selected `method`, e.g. + - "sim": The predicted/simulated item probability + - "transmat: The estimated transition probability matrix. 
Example: -------- import bwsample as bws - data = ( + evaluations = ( ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), ([2, 0, 0, 1], ['A', 'B', 'C', 'D']), ([0, 1, 2, 0], ['A', 'B', 'C', 'D']), ([0, 1, 0, 2], ['A', 'B', 'C', 'D']), ) - dok, _, _, _ = bws.extract_pairs_batch2(data) - ranked, ordids, scores, (x, transmat) = bws.rank( - dok, method='transition', n_rounds=3, calibration='minmax') + agg_dok, _, _, _, _ = bws.count(evaluations) + positions, sortedids, metrics, info = bws.rank( + agg_dok, method='trans', n_rounds=3) """ n = cnt.shape[0] @@ -528,16 +400,17 @@ def transition_simulation(cnt: scipy.sparse.dok.dok_matrix, x = x * transmat # sort, larger state probabilities are better - ranked = np.argsort(-x) # maximize - ordids = np.array(indices)[ranked].tolist() - scores = x[ranked] + positions = np.argsort(-x) # maximize + sortedids = np.array(indices)[positions] + metrics = x[positions] - # calibrate scores - if calibration == 'minmax': - scores = minmax(scores) + # informations + info = {} + info["sim"] = x + info["transmat"] = transmat # done - return ranked.tolist(), ordids, scores.tolist(), (x, transmat) + return positions, sortedids, metrics, info def mle_btl_sparse(cnt: scipy.sparse.csr_matrix, @@ -572,9 +445,8 @@ def mle_btl_sparse(cnt: scipy.sparse.csr_matrix, References: ----------- - Hunter, D.R., 2004. MM algorithms for generalized Bradley-Terry models. - The Annals of Statistics 32, 384–406. - https://doi.org/10.1214/aos/1079120141 + Hunter, D.R., 2004. MM algorithms for generalized Bradley-Terry models. The + Annals of Statistics 32, 384–406. https://doi.org/10.1214/aos/1079120141 """ # ensure CSR format cnt = cnt.tocsr() @@ -626,12 +498,18 @@ def mle_btl_sparse(cnt: scipy.sparse.csr_matrix, return np.array(x1.flatten())[0], False -def ranking_btl(cnt: scipy.sparse.csr_matrix, - indices: List[str], - calibration: Optional[str] = 'minmax', - prefit: Optional[bool] = True, - max_iter: Optional[int] = 50, - tol: Optional[float] = 1e-5): +def minmax(arr: np.array) -> np.array: + data = np.array(arr) + xmin = data.min() + xmax = data.max() + return (data - xmin) / (xmax - xmin) + + +def bradley_terry_probability(cnt: scipy.sparse.csr_matrix, + indices: List[str], + prefit: Optional[bool] = True, + max_iter: Optional[int] = 50, + tol: Optional[float] = 1e-5): """Bradley-Terry-Luce (BTL) probability model for pairwise comparisons Parameters: @@ -642,9 +520,6 @@ def ranking_btl(cnt: scipy.sparse.csr_matrix, indices : List[str] Identifiers, e.g. UUID4, of each row/column of the `cnt` matrix. - calibration: str (Default: None) - The calibrated scores. We recommend using Min-Max-Scaling (`'minmax'`) - prefit : bool flag to prefit parameters with 'ratio' method (see `ranking_maximize_ratios`) @@ -657,31 +532,32 @@ def ranking_btl(cnt: scipy.sparse.csr_matrix, Returns: -------- - ranked : List[int] + positions : np.array[uint64] The array positions to order/sort the original data by indexing. - ordids : List[int] - The item IDs in the new order. + sortedids : np.array[any] + The reordered item IDs - scores : List[float] - The scores for each item ID. Also sorted in descending order. + metrics : np.array[float] + The metric for each item ID. Also sorted in descending order. - info - x are the estimated MLE parameters that can be used for scoring + info : dict + Further information depending on the selected `method`, e.g. 
+ - "weights": The estimated MLE parameters that can be used for scoring Example: -------- import bwsample as bws - data = ( + evaluations = ( ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), ([2, 0, 0, 1], ['A', 'B', 'C', 'D']), ([0, 1, 2, 0], ['A', 'B', 'C', 'D']), ([0, 1, 0, 2], ['A', 'B', 'C', 'D']), ) - dok, _, _, _ = bws.extract_pairs_batch2(data) - ranked, ordids, scores, x = bws.rank( - dok, method='btl', calibration='minmax') + agg_dok, _, _, _, _ = bws.count(evaluations) + positions, sortedids, metrics, info = bws.rank( + agg_dok, method='btl', prefit=True, max_iter=100, tol=1e-5) """ cnt = cnt.tocsr() x0 = None @@ -696,13 +572,13 @@ def ranking_btl(cnt: scipy.sparse.csr_matrix, x, flag = mle_btl_sparse(cnt, x0=x0, max_iter=max_iter, tol=tol) # sort, larger state probabilities are better - ranked = np.argsort(-x) # maximize - ordids = np.array(indices)[ranked].tolist() - scores = x[ranked] + positions = np.argsort(-x) # maximize + sortedids = np.array(indices)[positions] + metrics = x[positions] - # calibrate scores (only minmax!) - if calibration == 'minmax': - scores = minmax(scores) + # informations + info = {} + info["weights"] = x # done - return ranked.tolist(), ordids, scores.tolist(), x + return positions, sortedids, metrics, info diff --git a/bwsample/utils.py b/bwsample/utils.py index 5d62324..a198a9c 100644 --- a/bwsample/utils.py +++ b/bwsample/utils.py @@ -1,8 +1,11 @@ -import itertools -import scipy.sparse -import numpy as np -from typing import Dict, Tuple, List -ItemID = str +import itertools # to_scipy +import scipy.sparse # to_scipy +import numpy as np # to_scipy, calibrate +import sklearn.linear_model # adjustscore +import sklearn.preprocessing # adjustscore +import scipy.special # adjustscore +from typing import Dict, Tuple, List, Optional +ItemID = str # add_dok def to_scipy(dok: Dict[Tuple[str, str], int], dtype=np.float64) -> ( @@ -79,3 +82,60 @@ def add_dok(a: Dict[Tuple[ItemID, ItemID], int], out[key] = val + out.get(key, 0) # done return out + + +def adjustscore(scores: np.array, + method: Optional[str] = 'quantile', + n_quantiles: Optional[int] = 10000, + labels: Optional[np.array] = None) -> np.array: + """Wrapper function to adjust scores + + Parameters: + ----------- + scores: np.array + The scores generated by a model. + + method: str (Default: None) + The calibration algorithm: + - 'quantile' -- sklearn's quantile transform + - 'sig3iqr' -- sigmoid 3x sklearn's robust scaler with (25%,75%) + - 'platt' -- calibrate scores with the binary labels (Platt, 1999) + + n_quantiles: Optional[int] = 10000 + Parameter for `method='quantile'` + + labels: Optional[np.array] + For `method='platt'`. The binary labels that are supposed to be + classified by the scores. + + Return: + ------- + adjusted : np.array + The adjusted scores + + References: + ----------- + Platt, J., 1999. Probabilistic outputs for support vector machines and + comparisons to regularized likelihood methods. 
+    """
+    scores = np.array(scores)
+    labels = np.array(labels)
+
+    if method == 'quantile':
+        return sklearn.preprocessing.quantile_transform(
+            X=scores.reshape(-1, 1),
+            n_quantiles=min(n_quantiles, len(scores)),
+            output_distribution='uniform')
+
+    elif method == 'sig3iqr':
+        adjusted = sklearn.preprocessing.robust_scale(
+            X=scores, quantile_range=(25, 75))
+        return scipy.special.expit(3 * adjusted)
+
+    elif method == 'platt':
+        cls = sklearn.linear_model.LogisticRegression()
+        cls.fit(X=scores.reshape(-1, 1), y=labels)
+        return cls.predict_proba(scores.reshape(-1, 1))[:, 1]
+
+    else:
+        raise Exception(f"The method='{method}' is not implemented.")

diff --git a/test/test_adjustscore.py b/test/test_adjustscore.py
new file mode 100644
index 0000000..4b7a187
--- /dev/null
+++ b/test/test_adjustscore.py
@@ -0,0 +1,41 @@
+import bwsample as bws
+import numpy as np
+import random
+
+
+def test1():
+    scores = [random.random() for _ in range(1000)]
+    adjusted = bws.adjustscore(scores, method='quantile')
+    assert np.argsort(scores).tolist() == np.argsort(adjusted).tolist()
+
+
+def test2():
+    scores = [.1, .3, .5, .7]
+    adjusted = bws.adjustscore(scores, method='quantile')
+    assert np.argsort(scores).tolist() == np.argsort(adjusted).tolist()
+
+
+def test3():
+    scores = [random.random() for _ in range(1000)]
+    adjusted = bws.adjustscore(scores, method='sig3iqr')
+    assert np.argsort(scores).tolist() == np.argsort(adjusted).tolist()
+
+
+def test4():
+    scores = [.1, .3, .5, .7]
+    adjusted = bws.adjustscore(scores, method='sig3iqr')
+    assert np.argsort(scores).tolist() == np.argsort(adjusted).tolist()
+
+
+def test5():
+    scores = [random.random() for _ in range(1000)]
+    labels = [s > 0.5 for s in scores]
+    adjusted = bws.adjustscore(scores, method='platt', labels=labels)
+    assert np.argsort(scores).tolist() == np.argsort(adjusted).tolist()
+
+
+def test6():
+    scores = [.1, .3, .5, .7]
+    labels = [s > 0.5 for s in scores]
+    adjusted = bws.adjustscore(scores, method='platt', labels=labels)
+    assert np.argsort(scores).tolist() == np.argsort(adjusted).tolist()

diff --git a/test/test_rank.py b/test/test_rank.py
index b36c7d3..68c14bb 100644
--- a/test/test_rank.py
+++ b/test/test_rank.py
@@ -3,46 +3,44 @@

 def test1():
     # demo data
-    data = (
+    evaluations = (
         ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),
         ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),
         ([2, 0, 0, 1], ['A', 'B', 'C', 'D']),
         ([0, 1, 2, 0], ['A', 'B', 'C', 'D']),
         ([0, 1, 0, 2], ['A', 'B', 'C', 'D']),
     )
-    dok, _ = bws.counting.direct_extract_batch(data)
+    dok, _, _, _, _ = bws.count(evaluations)

     # possible settings
     settings = [
-        {"method": "ratio", "avg": "all", "calibration": "platt"},
-        {"method": "ratio", "avg": "exist", "calibration": "platt"},
-        {"method": "ratio", "avg": "all", "calibration": "isotonic"},
-        {"method": "ratio", "avg": "exist", "calibration": "isotonic"},
-        {"method": "ratio", "avg": "all", "calibration": "minmax"},
-        {"method": "ratio", "avg": "exist", "calibration": "minmax"},
-        {"method": "pvalue", "avg": "all", "calibration": "platt"},
-        {"method": "pvalue", "avg": "exist", "calibration": "platt"},
-        {"method": "pvalue", "avg": "all", "calibration": "isotonic"},
-        {"method": "pvalue", "avg": "exist", "calibration": "isotonic"},
-        {"method": "pvalue", "avg": "all", "calibration": "minmax"},
-        {"method": "pvalue", "avg": "exist", "calibration": "minmax"},
-        {"method": "btl", "calibration": "platt"},
-        {"method": "btl", "calibration": "isotonic"},
-        {"method": "btl", "calibration": "minmax"},
-        {"method": "orme",
"calibration": "platt"}, - {"method": "orme", "calibration": "isotonic"}, - {"method": "orme", "calibration": "minmax"}, - {"method": "eigen", "calibration": "platt"}, - {"method": "eigen", "calibration": "isotonic"}, - {"method": "eigen", "calibration": "minmax"}, - {"method": "transition", "calibration": "platt"}, - {"method": "transition", "calibration": "isotonic"}, - {"method": "transition", "calibration": "minmax"} + {"method": "ratio", "avg": "all", "adjust": "platt"}, + {"method": "ratio", "avg": "exist", "adjust": "platt"}, + {"method": "ratio", "avg": "all", "adjust": "quantile"}, + {"method": "ratio", "avg": "exist", "adjust": "quantile"}, + {"method": "ratio", "avg": "all", "adjust": "sig3iqr"}, + {"method": "ratio", "avg": "exist", "adjust": "sig3iqr"}, + {"method": "pvalue", "avg": "all", "adjust": "platt"}, + {"method": "pvalue", "avg": "exist", "adjust": "platt"}, + {"method": "pvalue", "avg": "all", "adjust": "quantile"}, + {"method": "pvalue", "avg": "exist", "adjust": "quantile"}, + {"method": "pvalue", "avg": "all", "adjust": "sig3iqr"}, + {"method": "pvalue", "avg": "exist", "adjust": "sig3iqr"}, + {"method": "btl", "adjust": "platt"}, + {"method": "btl", "adjust": "quantile"}, + {"method": "btl", "adjust": "sig3iqr"}, + {"method": "eigen", "adjust": "platt"}, + {"method": "eigen", "adjust": "quantile"}, + {"method": "eigen", "adjust": "sig3iqr"}, + {"method": "trans", "adjust": "platt"}, + {"method": "trans", "adjust": "quantile"}, + {"method": "trans", "adjust": "sig3iqr"} ] # loop over each setting for setting in settings: - ranked, ordids, scores, _ = bws.rank(dok, **setting) - assert len(ranked) == 4 - assert len(ordids) == 4 + positions, sortedids, metrics, scores, info = bws.rank(dok, **setting) + assert len(positions) == 4 + assert len(sortedids) == 4 + assert len(metrics) == 4 assert len(scores) == 4 From e861fb0bcdd6b1132f5e0468e867b2e5b9bf4f86 Mon Sep 17 00:00:00 2001 From: UH <554c46@gmail.com> Date: Sat, 13 Mar 2021 18:33:03 +0100 Subject: [PATCH 4/7] example notebook added --- docs/rank.ipynb | 325 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 docs/rank.ipynb diff --git a/docs/rank.ipynb b/docs/rank.ipynb new file mode 100644 index 0000000..71d4a73 --- /dev/null +++ b/docs/rank.ipynb @@ -0,0 +1,325 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "educational-surge", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "adequate-flash", + "metadata": {}, + "outputs": [], + "source": [ + "import bwsample as bws\n", + "import numpy as np\n", + "\n", + "#import matplotlib.pyplot as plt\n", + "#%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "id": "simplified-finding", + "metadata": {}, + "source": [ + "# Prepare toy data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "otherwise-asset", + "metadata": {}, + "outputs": [], + "source": [ + "evaluations = (\n", + " ([1, 0, 0, 2], ['A', 'B', 'C', 'D']),\n", + " ([1, 0, 0, 2], ['A', 'B', 'C', 'D']), \n", + " ([2, 0, 0, 1], ['A', 'B', 'C', 'D']), \n", + " ([1, 2, 0, 0], ['D', 'E', 'F', 'A']),\n", + " ([0, 2, 1, 0], ['D', 'E', 'F', 'A']),\n", + " ([0, 0, 1, 2], ['D', 'E', 'F', 'A'])\n", + ")\n", + "\n", + "dok, _, _, _, _ = bws.count(evaluations)" + ] + }, + { + "cell_type": "markdown", + "id": "northern-copyright", + "metadata": {}, + "source": [ + "# Simple Ratios" + ] + }, + { + 
"cell_type": "code", + "execution_count": 4, + "id": "demographic-nashville", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Positions: [5 3 1 2 0 4]\n", + "Ordered IDs: ['F' 'D' 'B' 'C' 'A' 'E']\n", + " Scores: [1. 0.8 0.5 0.5 0.2 0. ]\n" + ] + } + ], + "source": [ + "positions, sortedids, metrics, scores, info = bws.rank(\n", + " dok, method='ratio', avg='exist', adjust='quantile')\n", + "\n", + "#print(np.max(scores), np.min(scores))\n", + "#plt.hist(scores);\n", + "\n", + "print(f\" Positions: {positions}\") \n", + "print(f\"Ordered IDs: {sortedids}\") \n", + "print(f\" Scores: {scores}\") " + ] + }, + { + "cell_type": "markdown", + "id": "metric-blast", + "metadata": {}, + "source": [ + "# 1-Minus p-Values" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "spatial-lexington", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Positions: [5 3 1 2 0 4]\n", + "Ordered IDs: ['F' 'D' 'B' 'C' 'A' 'E']\n", + " Scores: [1. 0.8 0.5 0.5 0.2 0. ]\n" + ] + } + ], + "source": [ + "positions, sortedids, metrics, scores, info = bws.rank(\n", + " dok, method='pvalue', avg='exist', adjust='quantile')\n", + "\n", + "print(f\" Positions: {positions}\") \n", + "print(f\"Ordered IDs: {sortedids}\") \n", + "print(f\" Scores: {scores}\") " + ] + }, + { + "cell_type": "markdown", + "id": "universal-recall", + "metadata": {}, + "source": [ + "# Bradley-Terry-Luce (BTL) model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "continuing-underwear", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Positions: [4 5 3 0 1 2]\n", + "Ordered IDs: ['E' 'F' 'D' 'A' 'B' 'C']\n", + " Scores: [1. 0.8 0.6 0.4 0. 0. ]\n" + ] + } + ], + "source": [ + "positions, sortedids, metrics, scores, info = bws.rank(\n", + " dok, method='btl', adjust='quantile')\n", + "\n", + "print(f\" Positions: {positions}\") \n", + "print(f\"Ordered IDs: {sortedids}\") \n", + "print(f\" Scores: {scores}\") " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "limiting-mouth", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Estimated MLE parameters: [0.10324378 0.07756639 0.07756639 0.13433648 0.36627794 0.24100903] \n", + "\n" + ] + } + ], + "source": [ + "print(f\"Estimated MLE parameters: {info['weights']}\", \"\\n\") " + ] + }, + { + "cell_type": "markdown", + "id": "undefined-tattoo", + "metadata": {}, + "source": [ + "# Eigenvector" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "described-examination", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Positions: [5 3 1 2 0 4]\n", + "Ordered IDs: ['F' 'D' 'B' 'C' 'A' 'E']\n", + " Scores: [1. 0.8 0.575 0.4 0.2 0. 
]\n" + ] + } + ], + "source": [ + "positions, sortedids, metrics, scores, info = bws.rank(\n", + " dok, method='eigen', adjust='quantile')\n", + "\n", + "print(f\" Positions: {positions}\") \n", + "print(f\"Ordered IDs: {sortedids}\") \n", + "print(f\" Scores: {scores}\") " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "artistic-worse", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Estimated eigenvector: [0.1991372 0.21750303 0.21750303 0.25644901 0.03970756 0.89352474]\n" + ] + } + ], + "source": [ + "print(f\"Estimated eigenvector: {np.abs(np.real(info['eigenvec'].reshape(-1)))}\")" + ] + }, + { + "cell_type": "markdown", + "id": "typical-shanghai", + "metadata": {}, + "source": [ + "# Transition Simulation" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "demanding-origin", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Positions: [5 3 0 2 1 4]\n", + "Ordered IDs: ['F' 'D' 'A' 'C' 'B' 'E']\n", + " Scores: [1. 0.8 0.6 0.4 0.2 0. ]\n" + ] + } + ], + "source": [ + "positions, sortedids, metrics, scores, info = bws.rank(\n", + " dok, method='trans', adjust='quantile')\n", + "\n", + "print(f\" Positions: {positions}\") \n", + "print(f\"Ordered IDs: {sortedids}\") \n", + "print(f\" Scores: {scores}\") " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "handled-theorem", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Simulated state probabilities: [0.11996576 0.10923823 0.10923823 0.12359208 0.04837442 0.13556368]\n" + ] + } + ], + "source": [ + "print(f\"Simulated state probabilities: {info['sim']}\") " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "imposed-identity", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Estimated Transition Matrix:\n", + "[[0.399 0.079 0.079 0.088 0.012 0.111]\n", + " [0.078 0.398 0.084 0.119 0.036 0.117]\n", + " [0.078 0.084 0.398 0.119 0.036 0.117]\n", + " [0.115 0.055 0.055 0.395 0.026 0.082]\n", + " [0.085 0.087 0.087 0.125 0.378 0.126]\n", + " [0.107 0.107 0.107 0.057 0.01 0.407]]\n" + ] + } + ], + "source": [ + "print(\"Estimated Transition Matrix:\")\n", + "print(info['transmat'].todense().round(3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 9c39ee69f2e3fa7de20cf557b0f61a83ad8780c6 Mon Sep 17 00:00:00 2001 From: UH <554c46@gmail.com> Date: Sat, 13 Mar 2021 18:33:16 +0100 Subject: [PATCH 5/7] formatting issues fixed --- bwsample/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bwsample/utils.py b/bwsample/utils.py index a198a9c..2077e94 100644 --- a/bwsample/utils.py +++ b/bwsample/utils.py @@ -125,17 +125,17 @@ def adjustscore(scores: np.array, return sklearn.preprocessing.quantile_transform( X=scores.reshape(-1, 1), n_quantiles=min(n_quantiles, len(scores)), - output_distribution='uniform') + output_distribution='uniform').reshape(-1) elif method == 'sig3iqr': adjusted = sklearn.preprocessing.robust_scale( X=scores, quantile_range=(25, 75)) - return 
scipy.special.expit(3 * adjusted)
+        return scipy.special.expit(3 * adjusted).reshape(-1)

     elif method == 'platt':
         cls = sklearn.linear_model.LogisticRegression()
         cls.fit(X=scores.reshape(-1, 1), y=labels)
-        return cls.predict_proba(scores.reshape(-1, 1))[:, 1]
+        return cls.predict_proba(scores.reshape(-1, 1))[:, 1].reshape(-1)

     else:
         raise Exception(f"The method='{method}' is not implemented.")

From 13a5fe47239a1f5d378a794635f5ace6bc708065 Mon Sep 17 00:00:00 2001
From: UH <554c46@gmail.com>
Date: Sat, 13 Mar 2021 18:33:31 +0100
Subject: [PATCH 6/7] matplotlib added

---
 requirements-demo.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-demo.txt b/requirements-demo.txt
index 8b63c34..90fe6a5 100644
--- a/requirements-demo.txt
+++ b/requirements-demo.txt
@@ -1,3 +1,4 @@
 # packages required to run example notebooks
 jupyterlab>=3.0.5
 pandas>=1.1.5
+matplotlib>=3.0.0

From 1ea31ed0895fd8d34e86a132439550787dc324bf Mon Sep 17 00:00:00 2001
From: UH <554c46@gmail.com>
Date: Sat, 13 Mar 2021 18:38:23 +0100
Subject: [PATCH 7/7] readme updated for ranking

---
 README.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d1d9557..d7a6125 100644
--- a/README.md
+++ b/README.md
@@ -143,9 +143,21 @@ The function `bwsample.rank` returns an index array with the proposed ordering

 ```python
 import bwsample as bws
-ranked, ordids, scores, info = bws.rank(dok, method='ratio')
+ranked, ordids, metrics, scores, info = bws.rank(dok, method='ratio', adjust='quantile')
 ```

+**Available methods:**
+Computed from extracted pairs:
+
+- `'ratio'` -- Simple ratios for each pair, and the sum of ratios for each item.
+- `'pvalue'` -- Chi-Squared based p-value for each pair, and the sum of `1-pval` for each item.
+- `'btl'` -- Bradley-Terry-Luce (BTL) model estimated with the MM algorithm (Hunter, 2004).
+- `'eigen'` -- Eigenvectors of the reciprocal pairwise comparison matrix (Saaty, 2003).
+- `'trans'` -- Estimates the transition probability that the next item is better.
+
+The implementations `'ratio'`, `'pvalue'`, `'btl'`, `'eigen'`, and `'trans'` are fully based on sparse matrix operations and `scipy.sparse` algorithms, and avoid accidental conversions to dense matrices.
+
 **References:**

 - Eigenvector solution in: Saaty, T. L. (2003). Decision-making with the AHP: Why is the principal eigenvector necessary. European Journal of Operational Research, 145(1), 85–91. [https://doi.org/10.1016/S0377-2217(02)00227-8](https://doi.org/10.1016/S0377-2217(02)00227-8)