From 08d3cf385b8ae3fade6f77aaefd3a86cfabd52d3 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 28 Nov 2024 14:53:39 +0100 Subject: [PATCH] integrate better sequence weighting into alignment protocols --- config/sample_config_monomer.txt | 8 ++++--- evcouplings/align/protocol.py | 38 ++++++++++++++++++++++++++----- evcouplings/couplings/protocol.py | 2 +- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index fa02863..9741166 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -94,9 +94,11 @@ align: # sequence database (specify possible databases and paths in "databases" section below) database: uniref100 - # compute the redundancy-reduced number of effective sequences (M_eff) already in the alignment stage. - # To save compute time, this computation is normally carried out in the couplings stage - compute_num_effective_seqs: False + # Method for sequence weight calculation. Weights calculated here will be reused by plmc in the couplings stage + # (requiring up-to-date plmc version supporting --weights command line argument) + # Options: nogaps (recommended new strategy, parallelized), withgaps (legacy strategy, parallelized), + # legacy: old single-threaded implementation, empty/none (do not compute sequence weights in align stage) + sequence_weights: nogaps # Filter sequence alignment at this % sequence identity cutoff. Can be used to cut computation time in # the couplings stage (e.g. set to 95 to remove any sequence that is more than 95% identical to a sequence diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 7fa6282..d7f7de5 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -952,8 +952,10 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, * # compute effective number of sequences # (this is intended for cases where coupling stage is - # not run, but this number is wanted nonetheless) - if kwargs["compute_num_effective_seqs"]: + # not run, but this number is wanted nonetheless); + # handles legacy style sequence weights (not reused in couplings stage) as well + # as new-style sequence weights (reused by plmc in couplings stage) + if kwargs["compute_num_effective_seqs"] or kwargs.get("sequence_weights"): # make sure we only compute N_eff on the columns # that would be used for model inference, dispose # the rest @@ -962,8 +964,32 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, * else: cut_ali = ali.select(columns=~lc_cols) - # compute sequence weights - cut_ali.set_weights(kwargs["theta"]) + # handle case where we want to forward sequence weights to couplings stage + # (i.e. not for old compute_num_effective_seqs parameter) + if kwargs.get("sequence_weights") is not None: + seq_weight_file = prefix + "_sequence_weights.csv" + + # compute sequence weights, reuse existing weights file if available + if kwargs.get("reuse_alignment") and valid_file(seq_weight_file): + # reload weights + with open(seq_weight_file) as f: + cut_ali.set_weights(weight_fileobj=f) + else: + # compute weights from scratch; default to legacy strategy if sequence_weights argument + # not specified (for compute_num_effective_seqs case) + cut_ali.set_weights( + identity_threshold=kwargs["theta"], + method=kwargs.get("sequence_weights", "legacy"), + cpu=kwargs.get("cpu"), + ) + + # save weights to file for reuse by plmc (one weight per line in text format) + with open(seq_weight_file, "w") as f: + cut_ali.save_weights(seq_weight_file) + + # add sequence weight file to outcfg to forward to couplings stage + if seq_weight_file is not None: + outcfg["sequence_weight_file"] = seq_weight_file # N_eff := sum of all sequence weights n_eff = float(cut_ali.weights.sum()) @@ -979,9 +1005,9 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, * }) # save sequence weights to file and add to output config - outcfg["sequence_weights_file"] = prefix + "_inverse_sequence_weights.csv" + outcfg["inverse_sequence_weight_file"] = prefix + "_inverse_sequence_weights.csv" inv_seq_weights.to_csv( - outcfg["sequence_weights_file"], index=False + outcfg["inverse_sequence_weight_file"], index=False ) else: n_eff = None diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py index b5eb770..5ac6247 100644 --- a/evcouplings/couplings/protocol.py +++ b/evcouplings/couplings/protocol.py @@ -81,7 +81,7 @@ def infer_plmc(**kwargs): segments (passed through) """ - # note: sequence_weights_file not enforced here for backwards compatibility + # note: sequence_weight_file not enforced here for backwards compatibility check_required( kwargs, [