Skip to content

Commit

Permalink
REF-1523-TLDR mode (#85)
Browse files Browse the repository at this point in the history
* tldr mode

* tldr
  • Loading branch information
SkBlaz authored Oct 23, 2024
1 parent 9b07ab8 commit b5f4a2c
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 26 deletions.
7 changes: 7 additions & 0 deletions outrank/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,13 @@ def main():
help='Relevant for task data_generator -- how many features.',
)

parser.add_argument(
'--tldr',
type=str,
default='True',
help='If enabled, it will output some of the main results on the screen after finishing.',
)

parser.add_argument(
'--num_synthetic_rows',
type=int,
Expand Down
72 changes: 54 additions & 18 deletions outrank/task_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,65 +3,101 @@
import logging
import os
from collections import defaultdict
from typing import Any
from typing import List

import numpy as np
import pandas as pd

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)


def outrank_task_result_summary(args):
triplets_path = os.path.join(args.output_folder, 'pairwise_ranks.tsv')
def read_and_sort_triplets(triplets_path: str) -> pd.DataFrame:
    """Read the pairwise-rank triplets and return them ordered by score.

    Args:
        triplets_path: Path to a tab-separated file that contains at least
            a ``Score`` column.

    Returns:
        The parsed rows sorted by ``Score`` in descending order. The
        original row index is kept (it is not reset).
    """
    triplets = pd.read_csv(triplets_path, sep='\t')
    # Sort exactly once; sorting an already sorted frame a second time only
    # costs time and memory.
    return triplets.sort_values(by='Score', ascending=False)


def generate_final_ranking(triplets: pd.DataFrame, label_column: str) -> list[list[Any]]:
    """Collect the score of every feature that is paired with the label.

    A feature name is compared through its prefix before the first ``-``.
    Rows in which neither side is the label are ignored. When both sides are
    the label, the ``FeatureA`` check wins and ``FeatureB`` is reported.

    Args:
        triplets: Frame with ``FeatureA``, ``FeatureB`` and ``Score`` columns.
        label_column: Name of the label feature (without any ``-`` suffix).

    Returns:
        A list of ``[feature, score]`` pairs in the row order of ``triplets``.
    """
    final_ranking = []
    # Iterate over the three needed columns directly instead of building a
    # Series per row with ``iterrows``.
    rows = zip(triplets['FeatureA'], triplets['FeatureB'], triplets['Score'])
    for feature_a, feature_b, score in rows:
        if label_column == feature_a.split('-')[0]:
            final_ranking.append([feature_b, score])
        elif label_column == feature_b.split('-')[0]:
            final_ranking.append([feature_a, score])
    return final_ranking

final_df = pd.DataFrame(final_ranking, columns=['Feature', f'Score {args.heuristic}'])

def create_final_dataframe(final_ranking: list[list[Any]], heuristic: str) -> pd.DataFrame:
    """Aggregate ``[feature, score]`` pairs into one ranked row per feature.

    Scores of the same feature are combined with the median and the result
    is sorted by score in descending order. When ``heuristic`` contains
    ``'MI'`` the scores are additionally min-max scaled to ``[0, 1]``.

    Args:
        final_ranking: ``[feature, score]`` pairs.
        heuristic: Heuristic name; it is embedded in the score column name
            (``'Score <heuristic>'``).

    Returns:
        A frame with the columns ``Feature`` and ``Score <heuristic>``.
    """
    score_column = f'Score {heuristic}'
    final_df = pd.DataFrame(final_ranking, columns=['Feature', score_column])
    final_df = (
        final_df.groupby('Feature')
        .median()
        .reset_index()
        .sort_values(by=score_column, ascending=False)
    )

    if 'MI' in heuristic:
        min_score = final_df[score_column].min()
        score_range = final_df[score_column].max() - min_score
        if score_range > 0:
            final_df[score_column] = (final_df[score_column] - min_score) / score_range
        elif score_range == 0:
            # All scores are identical: dividing by the zero range would turn
            # every score into NaN, so map the constant column to 0.0.
            final_df[score_column] = 0.0
        # A NaN range (no usable scores) leaves the column untouched.

    return final_df

logging.info(f'Storing summary files to {args.output_folder}')

def store_summary_files(final_df: pd.DataFrame, output_folder: str, heuristic: str, tldr: bool) -> None:
    """Write the feature ranking to disk and optionally print its top rows.

    Args:
        final_df: Ranked features to store.
        output_folder: Directory that receives ``feature_singles.tsv``.
        heuristic: Unused here; kept so existing callers keep working.
        tldr: Whether to print the first 20 rows. The command line passes
            this flag as a string, so textual values are interpreted:
            ``''``, ``'0'``, ``'false'``, ``'no'``, ``'n'``, ``'off'`` and
            ``'none'`` (case-insensitive) disable the preview. Any other
            string enables it. Non-string values use plain truthiness.
    """
    logging.info(f'Storing summary files to {output_folder}')
    # NOTE: this changes the pandas display options for the whole process,
    # not only for the preview printed below.
    pd.set_option('display.max_rows', None, 'display.max_columns', None)

    singles_path = os.path.join(output_folder, 'feature_singles.tsv')
    final_df.to_csv(singles_path, sep='\t', index=False)

    if isinstance(tldr, str):
        # A non-empty string such as 'False' is truthy, so it has to be
        # parsed instead of being tested directly.
        show_preview = tldr.strip().lower() not in {'', '0', 'false', 'no', 'n', 'off', 'none'}
    else:
        show_preview = bool(tldr)

    if show_preview:
        print(final_df.head(20))


def handle_interaction_order(final_df: pd.DataFrame, output_folder: str, heuristic: str, interaction_order: int) -> None:
    """Aggregate interaction scores per constituent feature and store them.

    Nothing is written unless ``interaction_order`` is greater than 1. For
    every interaction feature (name parts joined by ``' AND '``), the score
    is credited to each constituent. The median per constituent is written
    to ``feature_singles_aggregated.tsv``.

    Args:
        final_df: Frame with ``Feature`` and ``Score <heuristic>`` columns.
        output_folder: Directory that receives the aggregated file.
        heuristic: Heuristic name used in the input and output column names.
        interaction_order: Order of the feature interactions that were ranked.
    """
    if interaction_order <= 1:
        return

    score_column = f'Score {heuristic}'
    combined_column = f'Combined score (order: {interaction_order}, {heuristic})'

    feature_store = defaultdict(list)
    for fname, score in zip(final_df['Feature'], final_df[score_column]):
        # Match the separator including its spaces; a bare 'AND' test would
        # also catch single features whose name merely contains those
        # letters (e.g. 'BRAND').
        if ' AND ' in fname:
            for el in fname.split('-')[0].split(' AND '):
                feature_store[el].append(score)

    # Explicit columns keep the header row even when no interaction feature
    # was found, so the file stays parseable.
    final_aggregate_df = pd.DataFrame(
        [(feature, np.median(scores)) for feature, scores in feature_store.items()],
        columns=['Feature', combined_column],
    )
    final_aggregate_df.to_csv(
        os.path.join(output_folder, 'feature_singles_aggregated.tsv'), sep='\t', index=False,
    )

transformers_only_path = singles_path.replace('.tsv', '_transformers_only_imp.tsv')

def filter_transformers_only(final_df: pd.DataFrame, output_folder: str) -> None:
    """Store the rows whose feature name contains the ``_tr_`` marker.

    The selection is written to
    ``feature_singles_transformers_only_imp.tsv`` inside ``output_folder``.

    Args:
        final_df: Frame with a ``Feature`` column of feature names.
        output_folder: Directory that receives the filtered file.
    """
    transformers_only_path = os.path.join(output_folder, 'feature_singles_transformers_only_imp.tsv')
    # The marker is a plain substring, so skip regex interpretation, and
    # treat missing names as non-matches so the mask stays strictly boolean.
    is_transformer = final_df['Feature'].str.contains('_tr_', regex=False, na=False)
    final_df[is_transformer].to_csv(transformers_only_path, sep='\t', index=False)


def outrank_task_result_summary(args) -> None:
    """Build and store the summary files for a finished outrank run.

    Reads ``pairwise_ranks.tsv`` from ``args.output_folder``, derives the
    per-feature ranking against ``args.label_column`` and hands the result
    to the three writers. ``args`` must also provide ``heuristic``,
    ``tldr`` and ``interaction_order``.
    """
    out_dir = args.output_folder
    heuristic = args.heuristic

    ranked_triplets = read_and_sort_triplets(os.path.join(out_dir, 'pairwise_ranks.tsv'))
    summary_df = create_final_dataframe(
        generate_final_ranking(ranked_triplets, args.label_column),
        heuristic,
    )

    # The same ranking frame is passed to each writer in turn.
    store_summary_files(summary_df, out_dir, heuristic, args.tldr)
    handle_interaction_order(summary_df, out_dir, heuristic, args.interaction_order)
    filter_transformers_only(summary_df, out_dir)
14 changes: 7 additions & 7 deletions outrank/visualizations/ranking_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def visualize_hierarchical_clusters(
values='Score',
index='FeatureA',
columns='FeatureB',
aggfunc=np.mean,
aggfunc='mean', # Updated from np.mean to 'mean'
)

pivot_table.fillna(0, inplace=True)
Expand All @@ -59,7 +59,7 @@ def visualize_hierarchical_clusters(
)
plt.title(f'Linkage function: {linkage_heuristic}')
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter('ignore', UserWarning)
plt.tight_layout()
out_path = f'{output_folder}/dendrogram_{linkage_heuristic}.{image_format}'
plt.savefig(out_path, dpi=300)
Expand Down Expand Up @@ -95,7 +95,7 @@ def visualize_hierarchical_clusters(
dfx.columns = ['Silhouette', 'threshold', 'numClusters']
sns.lineplot(x='numClusters', y='Silhouette', data=dfx, color='black')
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter('ignore', UserWarning)
plt.tight_layout()
out_path = f'{output_folder}/SilhouetteProfile.{image_format}'
plt.savefig(out_path, dpi=300)
Expand All @@ -113,7 +113,7 @@ def visualize_hierarchical_clusters(
projected_data['ClusterID'] = top_clustering.astype(str)
sns.scatterplot(x='Dim1', y='Dim2', hue='ClusterID', data=projected_data, palette='Set2')
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter('ignore', UserWarning)
plt.tight_layout()
plt.savefig(f'{output_folder}/clustersEmbeddingVisualization.pdf', dpi=300)
plt.clf()
Expand All @@ -130,7 +130,7 @@ def visualize_heatmap(
sns.set(font_scale=2)
fig, ax = plt.subplots()
pivot_table = pd.pivot_table(
triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc=np.mean,
triplets, values='Score', index='FeatureA', columns='FeatureB', aggfunc='mean', # Updated from np.mean to 'mean'
)
mask = np.zeros_like(pivot_table.values)
mask[np.triu_indices_from(mask)] = True
Expand Down Expand Up @@ -160,7 +160,7 @@ def visualize_heatmap(
plt.xlabel('')
plt.ylabel('')
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter('ignore', UserWarning)
plt.tight_layout()
plt.savefig(f'{output_folder}/heatmap.{image_format}', dpi=500)
plt.clf()
Expand Down Expand Up @@ -245,7 +245,7 @@ def visualize_barplots(
plt.xlabel(f'Feature importance (based on heuristic {heuristic})')
plt.ylabel('')
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter('ignore', UserWarning)
plt.tight_layout()
plt.savefig(f'{output_folder}/barplot_top_{subset_range}.{image_format}', dpi=300)
plt.clf()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _read_description():
packages = [x for x in setuptools.find_packages() if x != 'test']
setuptools.setup(
name='outrank',
version='0.97.3',
version='0.97.4',
description='OutRank: Feature ranking for massive sparse data sets.',
long_description=_read_description(),
long_description_content_type='text/markdown',
Expand Down

0 comments on commit b5f4a2c

Please sign in to comment.