Skip to content

Commit

Permalink
Merge pull request #47 from outbrain/some-fixes
Browse files (browse the repository at this point in the history)
Renormalization for randomized-based heuristics
  • Loading branch information
SkBlaz authored Oct 9, 2023
2 parents 05f9039 + 1c8b7b7 commit 2fc30f2
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 9 deletions.
15 changes: 8 additions & 7 deletions outrank/algorithms/feature_ranking/ranking_mi_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,24 +81,25 @@ def compute_entropies(

initial_prob = _f_value_counts / all_events
x_value_subspace = np.where(X == f_values[f_index])

Y_classes = Y[x_value_subspace]
index = 0
Y_classes_spoofed = np.roll(Y, _f_value_counts)[x_value_subspace]

nonzero_class_counts = np.zeros(len(class_values), dtype=np.int32)
nonzero_class_counts_spoofed = np.zeros(len(class_values), dtype=np.int32)

# Cache nonzero counts
for c in class_values:
for index, c in enumerate(class_values):
nonzero_class_counts[index] = np.count_nonzero(Y_classes == c)
index += 1
nonzero_class_counts_spoofed[index] = np.count_nonzero(Y_classes_spoofed == c)

conditional_entropy += compute_conditional_entropy(
Y_classes, class_values, _f_value_counts, initial_prob, nonzero_class_counts,
)

if cardinality_correction:
# A neat hack that seems to work fine (permutations are expensive)
Y_classes = np.roll(Y, _f_value_counts)[x_value_subspace]

background_cond_entropy += compute_conditional_entropy(
Y_classes, class_values, _f_value_counts, initial_prob, nonzero_class_counts,
Y_classes_spoofed, class_values, _f_value_counts, initial_prob, nonzero_class_counts_spoofed,
)

if not cardinality_correction:
Expand Down
2 changes: 1 addition & 1 deletion outrank/core_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def mixed_rank_graph(

# Handle cont. types prior to interaction evaluation
pbar.set_description('Encoding columns')
col_dots = '.'
start_enc_timer = timer()
tmp_df = pd.DataFrame({k : tmp_df[k].cat.codes for k in all_columns})

end_enc_timer = timer()
out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _read_description():
packages = [x for x in setuptools.find_packages() if x != 'test']
setuptools.setup(
name='outrank',
version='0.94.2',
version='0.95',
description='OutRank: Feature ranking for massive sparse data sets.',
long_description=_read_description(),
long_description_content_type='text/markdown',
Expand Down

0 comments on commit 2fc30f2

Please sign in to comment.