Skip to content

Commit

Permalink
Merge pull request #805 from haddocking/cluster_id
Browse files Browse the repository at this point in the history
changed capri table column names from hyphenated to underscore style (e.g. cluster-id → cluster_id)
  • Loading branch information
mgiulini authored Feb 19, 2024
2 parents 42c2c14 + df70cac commit b297c2f
Showing 7 changed files with 46 additions and 46 deletions.
4 changes: 2 additions & 2 deletions src/haddock/clis/cli_analyse.py
Original file line number Diff line number Diff line change
@@ -318,11 +318,11 @@ def zip_top_ranked(capri_filename: FilePath, cluster_ranking: ClRank, summary_na
path to the zipped file
"""
capri_df = read_capri_table(capri_filename, comment="#")
gb_cluster = capri_df.groupby("cluster-id")
gb_cluster = capri_df.groupby("cluster_id")
for cl_id, cl_df in gb_cluster:
if cl_id in cluster_ranking.keys():
if cl_id != "-":
structs = cl_df.loc[cl_df["model-cluster-ranking"] <= 4][["model", "model-cluster-ranking"]]
structs = cl_df.loc[cl_df["model-cluster_ranking"] <= 4][["model", "model-cluster_ranking"]]
else:
structs = cl_df.loc[cl_df["caprieval_rank"] <= 10][["model", "caprieval_rank"]]
structs.columns = ["model", "rank"]
22 changes: 11 additions & 11 deletions src/haddock/libs/libinteractive.py
Original file line number Diff line number Diff line change
@@ -29,7 +29,7 @@ def handle_ss_file(
"""
# now we want to calculate mean and std dev of the scores on df_ss
# first groupby score
df_ss_grouped = df_ss.groupby("cluster-ranking")
df_ss_grouped = df_ss.groupby("cluster_ranking")
# sort the dataframe by score
df_ss.sort_values(by=["score"], inplace=True)
# calculate the mean and standard deviation of the first 4 elements
@@ -43,10 +43,10 @@ def handle_ss_file(
# get the index that sorts the array by the first column
clt_ranks = np.argsort(new_values[:, 0])
# adjust clustering values if there are clusters
if list(np.unique(df_ss["cluster-id"])) != ["-"]:
df_ss['model-cluster-ranking'] = df_ss.groupby('cluster-id')['score'].rank(ascending=True).astype(int) # noqa : E501
# assign to the values of cluster-ranking the corresponding clt_ranks
df_ss["cluster-ranking"] = df_ss["cluster-ranking"].apply(lambda x: clt_ranks[x - 1] + 1) # noqa : E501
if list(np.unique(df_ss["cluster_id"])) != ["-"]:
df_ss['model-cluster_ranking'] = df_ss.groupby('cluster_id')['score'].rank(ascending=True).astype(int) # noqa : E501
# assign to the values of cluster_ranking the corresponding clt_ranks
df_ss["cluster_ranking"] = df_ss["cluster_ranking"].apply(lambda x: clt_ranks[x - 1] + 1) # noqa : E501
# assign to the column caprieval_rank the index of the dataframe
df_ss.index = range(1, len(df_ss) + 1)
df_ss["caprieval_rank"] = df_ss.index
@@ -82,13 +82,13 @@ def rewrite_capri_tables(
f"../{model.path.split('/')[-1]}/{model.file_name}"
for model in clt_dic[cl]
]
# all the models should now have the cluster-id field
df_ss.loc[df_ss['model'].isin(models), 'cluster-id'] = cl
# all the models should now have the cluster_id field
df_ss.loc[df_ss['model'].isin(models), 'cluster_id'] = cl

# delete all the models that are not in the clusters
df_ss = df_ss[df_ss['cluster-id'] != "-"]
# assign cluster-ranking to cluster-id (aka random assignment)
df_ss['cluster-ranking'] = df_ss['cluster-id']
df_ss = df_ss[df_ss['cluster_id'] != "-"]
# assign cluster_ranking to cluster_id (aka random assignment)
df_ss['cluster_ranking'] = df_ss['cluster_id']
# handle ss file
df_ss, clt_ranks, _new_values = handle_ss_file(df_ss)

@@ -154,7 +154,7 @@ def handle_clt_file(df_ss, clt_ranks):
"""
capri_keys = ["score", "irmsd", "fnat", "lrmsd", "dockq"]
model_keys = ["air", "bsa", "desolv", "elec", "total", "vdw"]
df_ss_grouped = df_ss.groupby("cluster-id")
df_ss_grouped = df_ss.groupby("cluster_id")
# loop over df_ss_grouped
cl_data = []
for i, clt_id in enumerate(df_ss_grouped):
42 changes: 21 additions & 21 deletions src/haddock/libs/libplots.py
Original file line number Diff line number Diff line change
@@ -262,19 +262,19 @@ def box_plot_plotly(
color_idx = (cl_rank[cl_id] - 1) % len(colors) # color index

# Note: the rank format (float/int) in "cl_rank" is different from
# gb_full["cluster-ranking"]
rns = gb_full[gb_full["cluster-id"] == cl_id]["cluster-ranking"]
# gb_full["cluster_ranking"]
rns = gb_full[gb_full["cluster_id"] == cl_id]["cluster_ranking"]
rn = rns.unique()[0]
color_map[f"{rn}"] = colors[color_idx]

# Choose a different color for "Other" like in scatter plots
color_map["Other"] = "#DDDBDA"

# to use color_discrete_map, cluster-ranking column should be str not int
gb_full_string = gb_full.astype({"cluster-ranking": "string"})
# to use color_discrete_map, cluster_ranking column should be str not int
gb_full_string = gb_full.astype({"cluster_ranking": "string"})

# Rename for a better name in legend
gb_full_string.rename(columns={"cluster-ranking": "Cluster Rank"}, inplace=True)
gb_full_string.rename(columns={"cluster_ranking": "Cluster Rank"}, inplace=True)

# "Cluster Rank" is equivalent to "capri_rank"!
fig = px.box(
@@ -319,7 +319,7 @@ def box_plot_data(capri_df: pd.DataFrame, cl_rank: ClRank) -> pd.DataFrame:
gb_full : pandas DataFrame
DataFrame of all the clusters to be plotted
"""
gb_cluster = capri_df.groupby("cluster-id")
gb_cluster = capri_df.groupby("cluster_id")
gb_other = pd.DataFrame([])
gb_good = pd.DataFrame([])
for cl_id, cl_df in gb_cluster:
@@ -329,9 +329,9 @@ def box_plot_data(capri_df: pd.DataFrame, cl_rank: ClRank) -> pd.DataFrame:
cl_df["capri_rank"] = cl_rank[cl_id] # type: ignore
gb_good = pd.concat([gb_good, cl_df])

gb_other["cluster-id"] = "Other"
gb_other["cluster_id"] = "Other"
gb_other["capri_rank"] = len(cl_rank.keys()) + 1
gb_other["cluster-ranking"] = "Other"
gb_other["cluster_ranking"] = "Other"
gb_full = pd.concat([gb_good, gb_other])

# Sort based on "capri_rank"
@@ -392,7 +392,7 @@ def scatter_plot_plotly(
Parameters
----------
gb_cluster : pandas DataFrameGroupBy
capri DataFrame grouped by cluster-id
capri DataFrame grouped by cluster_id
gb_other : pandas DataFrame
DataFrame of clusters not in the top cluster ranking
cl_rank : dict
@@ -547,11 +547,11 @@ def scatter_plot_data(
Returns
-------
gb_cluster : pandas DataFrameGroupBy
capri DataFrame grouped by cluster-id
capri DataFrame grouped by cluster_id
gb_other : pandas DataFrame
DataFrame of clusters not in the top cluster ranking
"""
gb_cluster = capri_df.groupby("cluster-id")
gb_cluster = capri_df.groupby("cluster_id")
gb_other = pd.DataFrame([])
for cl_id, cl_df in gb_cluster:
if cl_id not in cl_rank.keys():
@@ -720,7 +720,7 @@ def find_best_struct(ss_file, number_of_struct=10):
"""
Find best structures.
It inspects model-cluster-ranking recorded in capri_ss.tsv file and finds
It inspects model-cluster_ranking recorded in capri_ss.tsv file and finds
the best models (models with lower ranks).
By default, it selects the 10 best models.
@@ -729,20 +729,20 @@ def find_best_struct(ss_file, number_of_struct=10):
ss_file : path
path to capri_ss.tsv
number_of_struct: int
number of models with lower model-cluster-ranking
number of models with lower model-cluster_ranking
Returns
-------
best_struct_df : pandas DataFrame
DataFrame of best structures
"""
dfss = read_capri_table(ss_file)
dfss = dfss.sort_values(by=["cluster-id", "model-cluster-ranking"])
dfss = dfss.sort_values(by=["cluster_id", "model-cluster_ranking"])
# TODO need a check for "Unclustered"

# count values within each cluster
# and select the column model-cluster-ranking
dfss_grouped = dfss.groupby("cluster-id").count()["model-cluster-ranking"]
# and select the column model-cluster_ranking
dfss_grouped = dfss.groupby("cluster_id").count()["model-cluster_ranking"]

# number of structs can be different per each cluster,
# so min value is picked here
@@ -752,11 +752,11 @@ def find_best_struct(ss_file, number_of_struct=10):
number_of_struct = min(number_of_struct, max_number_of_struct)

# select the best `number_of_struct` e.g. 4 structures for each cluster
best_struct_df = dfss.groupby("cluster-id").head(number_of_struct).copy()
best_struct_df = dfss.groupby("cluster_id").head(number_of_struct).copy()

# define names for best structures, e.g.
# Nr 1 best structure, Nr 2 best structure, ...
number_of_cluster = len(best_struct_df["cluster-id"].unique())
number_of_cluster = len(best_struct_df["cluster_id"].unique())
# zero pad number so after pivot columns are sorted correctly
col_names = [
f"Nr {(number + 1):02d} best structure" for number in range(number_of_struct)
@@ -765,18 +765,18 @@ def find_best_struct(ss_file, number_of_struct=10):
# add a new column `Structure` to the dataframe
best_struct_df = best_struct_df.assign(Structure=col_names)

# reshape data frame where columns are cluster-id, cluster-ranking,
# reshape data frame where columns are cluster_id, cluster_ranking,
# model,.., Nr 1 best structure, Nr 2 best structure, ...
best_struct_df = best_struct_df.pivot_table(
index=["cluster-id", "cluster-ranking"],
index=["cluster_id", "cluster_ranking"],
columns=["Structure"],
values="model",
aggfunc=lambda x: x,
)

best_struct_df.reset_index(inplace=True)
# Rename columns
columns = {"cluster-id": "Cluster ID", "cluster-ranking": "Cluster Rank"}
columns = {"cluster_id": "Cluster ID", "cluster_ranking": "Cluster Rank"}
best_struct_df.rename(columns=columns, inplace=True)

# unclustered id is "-", it is replaced by "Unclustered"
14 changes: 7 additions & 7 deletions src/haddock/modules/analysis/caprieval/capri.py
Original file line number Diff line number Diff line change
@@ -492,13 +492,13 @@ def make_output(self) -> None:
data["dockq"] = self.dockq

if self.has_cluster_info():
data["cluster-id"] = self.model.clt_id
data["cluster-ranking"] = self.model.clt_rank
data["model-cluster-ranking"] = self.model.clt_model_rank
data["cluster_id"] = self.model.clt_id
data["cluster_ranking"] = self.model.clt_rank
data["model-cluster_ranking"] = self.model.clt_model_rank
else:
data["cluster-id"] = None
data["cluster-ranking"] = None
data["model-cluster-ranking"] = None
data["cluster_id"] = None
data["cluster_ranking"] = None
data["model-cluster_ranking"] = None

# energies
if self.model.unw_energies:
@@ -736,7 +736,7 @@ def rearrange_ss_capri_output(
log.info(f"Rearranging output files into {output_fname}")
keyword = output_name.split(".")[0]
split_dict = {
"capri_ss": "model-cluster-ranking",
"capri_ss": "model-cluster_ranking",
"capri_clt": "caprieval_rank",
}
if keyword not in split_dict.keys():
2 changes: 1 addition & 1 deletion tests/data/capri_ss_-cluster.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model md5 caprieval_rank score irmsd fnat lrmsd ilrmsd dockq cluster-id cluster-ranking model-cluster-ranking air angles bonds bsa cdih coup dani desolv dihe elec improper rdcs rg total vdw vean xpcs
model md5 caprieval_rank score irmsd fnat lrmsd ilrmsd dockq cluster_id cluster_ranking model-cluster_ranking air angles bonds bsa cdih coup dani desolv dihe elec improper rdcs rg total vdw vean xpcs
../../01_rigidbody/rigidbody_6.pdb - 1 -16.799 15.821 0.000 25.530 28.835 0.036 - - - 199.509 0.000 0.000 730.249 0.000 0.000 0.000 -7.606 0.000 -3.795 0.000 0.000 0.000 186.678 -9.036 0.000 0.000
../../01_rigidbody/rigidbody_16.pdb - 2 -4.143 15.076 0.000 24.750 25.979 0.038 - - - 466.839 0.000 0.000 1065.700 0.000 0.000 0.000 3.139 0.000 -1.275 0.000 0.000 0.000 463.685 -1.880 0.000 0.000
../../01_rigidbody/rigidbody_20.pdb - 3 -3.712 15.507 0.000 25.503 31.745 0.036 - - - 286.218 0.000 0.000 621.882 0.000 0.000 0.000 2.391 0.000 -2.698 0.000 0.000 0.000 278.677 -4.842 0.000 0.000
2 changes: 1 addition & 1 deletion tests/golden_data/capri_ss_example.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model md5 caprieval_rank score irmsd fnat lrmsd ilrmsd dockq cluster-id cluster-ranking model-cluster-ranking air angles bonds bsa cdih coup dani desolv dihe elec improper rdcs rg total vdw vean xpcs
model md5 caprieval_rank score irmsd fnat lrmsd ilrmsd dockq cluster_id cluster_ranking model-cluster_ranking air angles bonds bsa cdih coup dani desolv dihe elec improper rdcs rg total vdw vean xpcs
../1_rigidbody/rigidbody_373.pdb - 1 -37.710 3.031 0.333 5.487 4.130 0.412 16 1 1 147.270 0.000 0.000 1209.810 0.000 0.000 0.000 -15.058 0.000 -12.018 0.000 0.000 0.000 134.324 -0.929 0.000 0.000
../1_rigidbody/rigidbody_339.pdb - 2 -37.537 3.109 0.333 5.581 4.111 0.407 16 1 2 112.421 0.000 0.000 1188.850 0.000 0.000 0.000 -14.864 0.000 -11.884 0.000 0.000 0.000 98.101 -2.435 0.000 0.000
../1_rigidbody/rigidbody_383.pdb - 3 -36.098 2.747 0.306 4.787 3.680 0.431 1 2 1 117.214 0.000 0.000 977.829 0.000 0.000 0.000 -16.870 0.000 -10.470 0.000 0.000 0.000 91.558 -15.187 0.000 0.000
6 changes: 3 additions & 3 deletions tests/test_module_caprieval.py
Original file line number Diff line number Diff line change
@@ -326,7 +326,7 @@ def test_make_output(protprot_caprimodule):
observed_outf_l = read_capri_file(ss_fname)
expected_outf_l = [
['md5', 'caprieval_rank', 'score', 'irmsd', 'fnat', 'lrmsd', 'ilrmsd',
'dockq', 'cluster-id', 'cluster-ranking', 'model-cluster-ranking'],
'dockq', 'cluster_id', 'cluster_ranking', 'model-cluster_ranking'],
['-', '-', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', '1', '1', '10'], ]

assert observed_outf_l == expected_outf_l
@@ -451,8 +451,8 @@ def test_rearrange_ss_capri_output():
with open(f"{golden_data}/capri_ss_1.tsv", 'w') as fh:
fh.write(
"model caprieval_rank score irmsd fnat lrmsd ilrmsd "
"dockq cluster-id cluster-ranking "
"model-cluster-ranking" + os.linesep)
"dockq cluster_id cluster_ranking "
"model-cluster_ranking" + os.linesep)
fh.write(
"../1_emscoring/emscoring_909.pdb 1 -424.751 0.000 "
"1.000 0.000 0.000 1.000 - - -" + os.linesep)

0 comments on commit b297c2f

Please sign in to comment.