diff --git a/src/haddock/clis/cli_analyse.py b/src/haddock/clis/cli_analyse.py index c952ed1f3..88b1beb66 100644 --- a/src/haddock/clis/cli_analyse.py +++ b/src/haddock/clis/cli_analyse.py @@ -318,11 +318,11 @@ def zip_top_ranked(capri_filename: FilePath, cluster_ranking: ClRank, summary_na path to the zipped file """ capri_df = read_capri_table(capri_filename, comment="#") - gb_cluster = capri_df.groupby("cluster-id") + gb_cluster = capri_df.groupby("cluster_id") for cl_id, cl_df in gb_cluster: if cl_id in cluster_ranking.keys(): if cl_id != "-": - structs = cl_df.loc[cl_df["model-cluster-ranking"] <= 4][["model", "model-cluster-ranking"]] + structs = cl_df.loc[cl_df["model-cluster_ranking"] <= 4][["model", "model-cluster_ranking"]] else: structs = cl_df.loc[cl_df["caprieval_rank"] <= 10][["model", "caprieval_rank"]] structs.columns = ["model", "rank"] diff --git a/src/haddock/libs/libinteractive.py b/src/haddock/libs/libinteractive.py index 189a8ab33..487caf30c 100644 --- a/src/haddock/libs/libinteractive.py +++ b/src/haddock/libs/libinteractive.py @@ -29,7 +29,7 @@ def handle_ss_file( """ # now we want to calculate mean and std dev of the scores on df_ss # first groupby score - df_ss_grouped = df_ss.groupby("cluster-ranking") + df_ss_grouped = df_ss.groupby("cluster_ranking") # sort the dataframe by score df_ss.sort_values(by=["score"], inplace=True) # calculate the mean and standard deviation of the first 4 elements @@ -43,10 +43,10 @@ def handle_ss_file( # get the index that sorts the array by the first column clt_ranks = np.argsort(new_values[:, 0]) # adjust clustering values if there are clusters - if list(np.unique(df_ss["cluster-id"])) != ["-"]: - df_ss['model-cluster-ranking'] = df_ss.groupby('cluster-id')['score'].rank(ascending=True).astype(int) # noqa : E501 - # assign to the values of cluster-ranking the corresponding clt_ranks - df_ss["cluster-ranking"] = df_ss["cluster-ranking"].apply(lambda x: clt_ranks[x - 1] + 1) # noqa : E501 + if list(np.unique(df_ss["cluster_id"])) != ["-"]: + df_ss['model-cluster_ranking'] = df_ss.groupby('cluster_id')['score'].rank(ascending=True).astype(int) # noqa : E501 + # assign to the values of cluster_ranking the corresponding clt_ranks + df_ss["cluster_ranking"] = df_ss["cluster_ranking"].apply(lambda x: clt_ranks[x - 1] + 1) # noqa : E501 # assign to the column caprieval_rank the index of the dataframe df_ss.index = range(1, len(df_ss) + 1) df_ss["caprieval_rank"] = df_ss.index @@ -82,13 +82,13 @@ def rewrite_capri_tables( f"../{model.path.split('/')[-1]}/{model.file_name}" for model in clt_dic[cl] ] - # all the models should now have the cluster-id field - df_ss.loc[df_ss['model'].isin(models), 'cluster-id'] = cl + # all the models should now have the cluster_id field + df_ss.loc[df_ss['model'].isin(models), 'cluster_id'] = cl # delete all the models that are not in the clusters - df_ss = df_ss[df_ss['cluster-id'] != "-"] - # assign cluster-ranking to cluster-id (aka random assignment) - df_ss['cluster-ranking'] = df_ss['cluster-id'] + df_ss = df_ss[df_ss['cluster_id'] != "-"] + # assign cluster_ranking to cluster_id (aka random assignment) + df_ss['cluster_ranking'] = df_ss['cluster_id'] # handle ss file df_ss, clt_ranks, _new_values = handle_ss_file(df_ss) @@ -154,7 +154,7 @@ def handle_clt_file(df_ss, clt_ranks): """ capri_keys = ["score", "irmsd", "fnat", "lrmsd", "dockq"] model_keys = ["air", "bsa", "desolv", "elec", "total", "vdw"] - df_ss_grouped = df_ss.groupby("cluster-id") + df_ss_grouped = df_ss.groupby("cluster_id") # loop over df_ss_grouped cl_data = [] for i, clt_id in enumerate(df_ss_grouped): diff --git a/src/haddock/libs/libplots.py b/src/haddock/libs/libplots.py index 6e3155be8..4d987315a 100644 --- a/src/haddock/libs/libplots.py +++ b/src/haddock/libs/libplots.py @@ -262,19 +262,19 @@ def box_plot_plotly( color_idx = (cl_rank[cl_id] - 1) % len(colors) # color index # Note: the rank format (float/int) in "cl_rank" is different from - # gb_full["cluster-ranking"] - rns = gb_full[gb_full["cluster-id"] == cl_id]["cluster-ranking"] + # gb_full["cluster_ranking"] + rns = gb_full[gb_full["cluster_id"] == cl_id]["cluster_ranking"] rn = rns.unique()[0] color_map[f"{rn}"] = colors[color_idx] # Choose a different color for "Other" like in scatter plots color_map["Other"] = "#DDDBDA" - # to use color_discrete_map, cluster-ranking column should be str not int - gb_full_string = gb_full.astype({"cluster-ranking": "string"}) + # to use color_discrete_map, cluster_ranking column should be str not int + gb_full_string = gb_full.astype({"cluster_ranking": "string"}) # Rename for a better name in legend - gb_full_string.rename(columns={"cluster-ranking": "Cluster Rank"}, inplace=True) + gb_full_string.rename(columns={"cluster_ranking": "Cluster Rank"}, inplace=True) # "Cluster Rank" is equivalent to "capri_rank"! fig = px.box( @@ -319,7 +319,7 @@ def box_plot_data(capri_df: pd.DataFrame, cl_rank: ClRank) -> pd.DataFrame: gb_full : pandas DataFrame DataFrame of all the clusters to be plotted """ - gb_cluster = capri_df.groupby("cluster-id") + gb_cluster = capri_df.groupby("cluster_id") gb_other = pd.DataFrame([]) gb_good = pd.DataFrame([]) for cl_id, cl_df in gb_cluster: @@ -329,9 +329,9 @@ def box_plot_data(capri_df: pd.DataFrame, cl_rank: ClRank) -> pd.DataFrame: cl_df["capri_rank"] = cl_rank[cl_id] # type: ignore gb_good = pd.concat([gb_good, cl_df]) - gb_other["cluster-id"] = "Other" + gb_other["cluster_id"] = "Other" gb_other["capri_rank"] = len(cl_rank.keys()) + 1 - gb_other["cluster-ranking"] = "Other" + gb_other["cluster_ranking"] = "Other" gb_full = pd.concat([gb_good, gb_other]) # Sort based on "capri_rank" @@ -392,7 +392,7 @@ def scatter_plot_plotly( Parameters ---------- gb_cluster : pandas DataFrameGroupBy - capri DataFrame grouped by cluster-id + capri DataFrame grouped by cluster_id gb_other : pandas DataFrame DataFrame of clusters not in the top cluster ranking cl_rank : dict @@ -547,11 +547,11 @@ def scatter_plot_data( Returns ------- gb_cluster : pandas DataFrameGroupBy - capri DataFrame grouped by cluster-id + capri DataFrame grouped by cluster_id gb_other : pandas DataFrame DataFrame of clusters not in the top cluster ranking """ - gb_cluster = capri_df.groupby("cluster-id") + gb_cluster = capri_df.groupby("cluster_id") gb_other = pd.DataFrame([]) for cl_id, cl_df in gb_cluster: if cl_id not in cl_rank.keys(): @@ -720,7 +720,7 @@ def find_best_struct(ss_file, number_of_struct=10): """ Find best structures. - It inspects model-cluster-ranking recorded in capri_ss.tsv file and finds + It inspects model-cluster_ranking recorded in capri_ss.tsv file and finds the best models (models with lower ranks). By default, it selects the 10 best models. @@ -729,7 +729,7 @@ def find_best_struct(ss_file, number_of_struct=10): ss_file : path path to capri_ss.tsv number_of_struct: int - number of models with lower model-cluster-ranking + number of models with lower model-cluster_ranking Returns ------- @@ -737,12 +737,12 @@ def find_best_struct(ss_file, number_of_struct=10): DataFrame of best structures """ dfss = read_capri_table(ss_file) - dfss = dfss.sort_values(by=["cluster-id", "model-cluster-ranking"]) + dfss = dfss.sort_values(by=["cluster_id", "model-cluster_ranking"]) # TODO need a check for "Unclustered" # count values within each cluster - # and select the column model-cluster-ranking - dfss_grouped = dfss.groupby("cluster-id").count()["model-cluster-ranking"] + # and select the column model-cluster_ranking + dfss_grouped = dfss.groupby("cluster_id").count()["model-cluster_ranking"] # number of structs can be different per each cluster, # so min value is picked here @@ -752,11 +752,11 @@ def find_best_struct(ss_file, number_of_struct=10): number_of_struct = min(number_of_struct, max_number_of_struct) # select the best `number_of_struct` e.g. 4 structures for each cluster - best_struct_df = dfss.groupby("cluster-id").head(number_of_struct).copy() + best_struct_df = dfss.groupby("cluster_id").head(number_of_struct).copy() # define names for best structures, e.g. # Nr 1 best structure, Nr 2 best structure, ... - number_of_cluster = len(best_struct_df["cluster-id"].unique()) + number_of_cluster = len(best_struct_df["cluster_id"].unique()) # zero pad number so after pivot columns are sorted correctly col_names = [ f"Nr {(number + 1):02d} best structure" for number in range(number_of_struct) @@ -765,10 +765,10 @@ def find_best_struct(ss_file, number_of_struct=10): # add a new column `Structure` to the dataframe best_struct_df = best_struct_df.assign(Structure=col_names) - # reshape data frame where columns are cluster-id, cluster-ranking, + # reshape data frame where columns are cluster_id, cluster_ranking, # model,.., Nr 1 best structure, Nr 2 best structure, ... best_struct_df = best_struct_df.pivot_table( - index=["cluster-id", "cluster-ranking"], + index=["cluster_id", "cluster_ranking"], columns=["Structure"], values="model", aggfunc=lambda x: x, @@ -776,7 +776,7 @@ def find_best_struct(ss_file, number_of_struct=10): best_struct_df.reset_index(inplace=True) # Rename columns - columns = {"cluster-id": "Cluster ID", "cluster-ranking": "Cluster Rank"} + columns = {"cluster_id": "Cluster ID", "cluster_ranking": "Cluster Rank"} best_struct_df.rename(columns=columns, inplace=True) # unclustered id is "-", it is replaced by "Unclustered" diff --git a/src/haddock/modules/analysis/caprieval/capri.py b/src/haddock/modules/analysis/caprieval/capri.py index 9db7261ea..af1468ddc 100644 --- a/src/haddock/modules/analysis/caprieval/capri.py +++ b/src/haddock/modules/analysis/caprieval/capri.py @@ -492,13 +492,13 @@ def make_output(self) -> None: data["dockq"] = self.dockq if self.has_cluster_info(): - data["cluster-id"] = self.model.clt_id - data["cluster-ranking"] = self.model.clt_rank - data["model-cluster-ranking"] = self.model.clt_model_rank + data["cluster_id"] = self.model.clt_id + data["cluster_ranking"] = self.model.clt_rank + data["model-cluster_ranking"] = self.model.clt_model_rank else: - data["cluster-id"] = None - data["cluster-ranking"] = None - data["model-cluster-ranking"] = None + data["cluster_id"] = None + data["cluster_ranking"] = None + data["model-cluster_ranking"] = None # energies if self.model.unw_energies: @@ -736,7 +736,7 @@ def rearrange_ss_capri_output( log.info(f"Rearranging output files into {output_fname}") keyword = output_name.split(".")[0] split_dict = { - "capri_ss": "model-cluster-ranking", + "capri_ss": "model-cluster_ranking", "capri_clt": "caprieval_rank", } if keyword not in split_dict.keys(): diff --git a/tests/data/capri_ss_-cluster.tsv b/tests/data/capri_ss_-cluster.tsv index 06b6a1b2f..2a96ed55c 100644 --- a/tests/data/capri_ss_-cluster.tsv +++ b/tests/data/capri_ss_-cluster.tsv @@ -1,4 +1,4 @@ -model md5 caprieval_rank score irmsd fnat lrmsd ilrmsd dockq cluster-id cluster-ranking model-cluster-ranking air angles bonds bsa cdih coup dani desolv dihe elec improper rdcs rg total vdw vean xpcs +model md5 caprieval_rank score irmsd fnat lrmsd ilrmsd dockq cluster_id cluster_ranking model-cluster_ranking air angles bonds bsa cdih coup dani desolv dihe elec improper rdcs rg total vdw vean xpcs ../../01_rigidbody/rigidbody_6.pdb - 1 -16.799 15.821 0.000 25.530 28.835 0.036 - - - 199.509 0.000 0.000 730.249 0.000 0.000 0.000 -7.606 0.000 -3.795 0.000 0.000 0.000 186.678 -9.036 0.000 0.000 ../../01_rigidbody/rigidbody_16.pdb - 2 -4.143 15.076 0.000 24.750 25.979 0.038 - - - 466.839 0.000 0.000 1065.700 0.000 0.000 0.000 3.139 0.000 -1.275 0.000 0.000 0.000 463.685 -1.880 0.000 0.000 ../../01_rigidbody/rigidbody_20.pdb - 3 -3.712 15.507 0.000 25.503 31.745 0.036 - - - 286.218 0.000 0.000 621.882 0.000 0.000 0.000 2.391 0.000 -2.698 0.000 0.000 0.000 278.677 -4.842 0.000 0.000 diff --git a/tests/golden_data/capri_ss_example.tsv b/tests/golden_data/capri_ss_example.tsv index 348db183d..136cda8d6 100644 --- a/tests/golden_data/capri_ss_example.tsv +++ b/tests/golden_data/capri_ss_example.tsv @@ -1,4 +1,4 @@ -model md5 caprieval_rank score irmsd fnat lrmsd ilrmsd dockq cluster-id cluster-ranking model-cluster-ranking air angles bonds bsa cdih coup dani desolv dihe elec improper rdcs rg total vdw vean xpcs +model md5 caprieval_rank score irmsd fnat lrmsd ilrmsd dockq cluster_id cluster_ranking model-cluster_ranking air angles bonds bsa cdih coup dani desolv dihe elec improper rdcs rg total vdw vean xpcs ../1_rigidbody/rigidbody_373.pdb - 1 -37.710 3.031 0.333 5.487 4.130 0.412 16 1 1 147.270 0.000 0.000 1209.810 0.000 0.000 0.000 -15.058 0.000 -12.018 0.000 0.000 0.000 134.324 -0.929 0.000 0.000 ../1_rigidbody/rigidbody_339.pdb - 2 -37.537 3.109 0.333 5.581 4.111 0.407 16 1 2 112.421 0.000 0.000 1188.850 0.000 0.000 0.000 -14.864 0.000 -11.884 0.000 0.000 0.000 98.101 -2.435 0.000 0.000 ../1_rigidbody/rigidbody_383.pdb - 3 -36.098 2.747 0.306 4.787 3.680 0.431 1 2 1 117.214 0.000 0.000 977.829 0.000 0.000 0.000 -16.870 0.000 -10.470 0.000 0.000 0.000 91.558 -15.187 0.000 0.000 diff --git a/tests/test_module_caprieval.py b/tests/test_module_caprieval.py index b16c66f66..7acdf8b02 100644 --- a/tests/test_module_caprieval.py +++ b/tests/test_module_caprieval.py @@ -326,7 +326,7 @@ def test_make_output(protprot_caprimodule): observed_outf_l = read_capri_file(ss_fname) expected_outf_l = [ ['md5', 'caprieval_rank', 'score', 'irmsd', 'fnat', 'lrmsd', 'ilrmsd', - 'dockq', 'cluster-id', 'cluster-ranking', 'model-cluster-ranking'], + 'dockq', 'cluster_id', 'cluster_ranking', 'model-cluster_ranking'], ['-', '-', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', '1', '1', '10'], ] assert observed_outf_l == expected_outf_l @@ -451,8 +451,8 @@ def test_rearrange_ss_capri_output(): with open(f"{golden_data}/capri_ss_1.tsv", 'w') as fh: fh.write( "model caprieval_rank score irmsd fnat lrmsd ilrmsd " - "dockq cluster-id cluster-ranking " - "model-cluster-ranking" + os.linesep) + "dockq cluster_id cluster_ranking " + "model-cluster_ranking" + os.linesep) fh.write( "../1_emscoring/emscoring_909.pdb 1 -424.751 0.000 " "1.000 0.000 0.000 1.000 - - -" + os.linesep)