Skip to content

Commit

Permalink
integrate better sequence weighting into alignment protocols
Browse files Browse the repository at this point in the history
  • Loading branch information
thomashopf committed Nov 28, 2024
1 parent 5499c50 commit 08d3cf3
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 10 deletions.
8 changes: 5 additions & 3 deletions config/sample_config_monomer.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,11 @@ align:
# sequence database (specify possible databases and paths in "databases" section below)
database: uniref100

# compute the redundancy-reduced number of effective sequences (M_eff) already in the alignment stage.
# To save compute time, this computation is normally carried out in the couplings stage
compute_num_effective_seqs: False
# Method for sequence weight calculation. Weights calculated here will be reused by plmc in the couplings stage
# (requiring up-to-date plmc version supporting --weights command line argument)
# Options: nogaps (recommended new strategy, parallelized), withgaps (legacy strategy, parallelized),
# legacy: old single-threaded implementation, empty/none (do not compute sequence weights in align stage)
sequence_weights: nogaps

# Filter sequence alignment at this % sequence identity cutoff. Can be used to cut computation time in
# the couplings stage (e.g. set to 95 to remove any sequence that is more than 95% identical to a sequence
Expand Down
38 changes: 32 additions & 6 deletions evcouplings/align/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -952,8 +952,10 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, *

# compute effective number of sequences
# (this is intended for cases where coupling stage is
# not run, but this number is wanted nonetheless)
if kwargs["compute_num_effective_seqs"]:
# not run, but this number is wanted nonetheless);
# handles legacy style sequence weights (not reused in couplings stage) as well
# as new-style sequence weights (reused by plmc in couplings stage)
if kwargs["compute_num_effective_seqs"] or kwargs.get("sequence_weights"):
# make sure we only compute N_eff on the columns
# that would be used for model inference, dispose
# the rest
Expand All @@ -962,8 +964,32 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, *
else:
cut_ali = ali.select(columns=~lc_cols)

# compute sequence weights
cut_ali.set_weights(kwargs["theta"])
# handle case where we want to forward sequence weights to couplings stage
# (i.e. not for old compute_num_effective_seqs parameter)
if kwargs.get("sequence_weights") is not None:
seq_weight_file = prefix + "_sequence_weights.csv"

# compute sequence weights, reuse existing weights file if available
if kwargs.get("reuse_alignment") and valid_file(seq_weight_file):
# reload weights
with open(seq_weight_file) as f:
cut_ali.set_weights(weight_fileobj=f)
else:
# compute weights from scratch; default to legacy strategy if sequence_weights argument
# not specified (for compute_num_effective_seqs case)
cut_ali.set_weights(
identity_threshold=kwargs["theta"],
method=kwargs.get("sequence_weights", "legacy"),
cpu=kwargs.get("cpu"),
)

# save weights to file for reuse by plmc (one weight per line in text format)
with open(seq_weight_file, "w") as f:
cut_ali.save_weights(seq_weight_file)

# add sequence weight file to outcfg to forward to couplings stage
if seq_weight_file is not None:
outcfg["sequence_weight_file"] = seq_weight_file

# N_eff := sum of all sequence weights
n_eff = float(cut_ali.weights.sum())
Expand All @@ -979,9 +1005,9 @@ def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, *
})

# save sequence weights to file and add to output config
outcfg["sequence_weights_file"] = prefix + "_inverse_sequence_weights.csv"
outcfg["inverse_sequence_weight_file"] = prefix + "_inverse_sequence_weights.csv"
inv_seq_weights.to_csv(
outcfg["sequence_weights_file"], index=False
outcfg["inverse_sequence_weight_file"], index=False
)
else:
n_eff = None
Expand Down
2 changes: 1 addition & 1 deletion evcouplings/couplings/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def infer_plmc(**kwargs):
segments (passed through)
"""
# note: sequence_weights_file not enforced here for backwards compatibility
# note: sequence_weight_file not enforced here for backwards compatibility
check_required(
kwargs,
[
Expand Down

0 comments on commit 08d3cf3

Please sign in to comment.