Skip to content

Commit

Permalink
Keep the linter happy
Browse files Browse the repository at this point in the history
  • Loading branch information
DrYak committed Jun 7, 2024
1 parent 04ca094 commit 9ab7264
Show file tree
Hide file tree
Showing 23 changed files with 401 additions and 225 deletions.
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
# Local haplotype reconstruction benchmark

This repository stores the scripts and notebooks used to conduct the benchmark study presented in the manuscript: XXX

To reproduce the benchmark study and create the figures presented in the manuscript, use the following instructions:
1. Clone the repository of V-pipe 3.0 into your working directory:
```
git clone https://github.com/cbg-ethz/V-pipe.git
```
```shell
git clone https://github.com/cbg-ethz/V-pipe.git
```
2. Go into the directory of the benchmarking study for the local haplotype reconstruction:
```
cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup
```
```shell
cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup
```
3. The parameters to reproduce the synthetic datasets are in the parameter files `config_xxx/params.csv` with the configuration file `config_xxx/config.yaml` where simulation mode, replicate number and methods to be executed are defined.
4. The methods to execute must be define in a Python script in this directory:
```
V-pipe/resources/auxiliary_workflows/benchmark/resources/method_definitions.
```
```shell
V-pipe/resources/auxiliary_workflows/benchmark/resources/method_definitions.
```
5. Now the workflow is ready, go back to the directory
```
cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup
```
```shell
cd V-pipe/resources/auxiliary_workflows/benchmark/resources/local_haplo_setup
```
6. To install the needed Conda environments execute:
```
snakemake --conda-create-envs-only --use-conda -c1
```
```shell
snakemake --conda-create-envs-only --use-conda -c1
```
7. To submit the workflow to a lsf-cluster execute `./run_workflow.sh`, otherwise execute the workflow with `snakemake --use-conda -c1`
8. The workflow will provide the results in the directory `results`.
9. When the workflow has terminated and all result files were generated, figures from the manuscript can be generated by executing the notebooks in `workflow/notebooks/`.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ def convert_vcf(fname):
variant_list.add(f"{zero_based_pos}{base}")
return variant_list


def convert_groundtruth(fname):
df = pd.read_csv(fname, index_col=0)
return set((df["position"].astype(str) + df["variant"]).tolist())


def mutation_calls_details(vcf_list, groundtruth_list):
# compute performance
tmp = []
Expand All @@ -45,11 +47,16 @@ def mutation_calls_details(vcf_list, groundtruth_list):
predicted_variants = convert_vcf(fname_vcf)

# iter through ground truth mutations
set_true_variants = [str(gt_row["position"]) + gt_row["variant"] for iter_row, gt_row in pd.read_csv(fname_groundtruth, index_col=0).iterrows()]
set_true_variants = [
str(gt_row["position"]) + gt_row["variant"]
for iter_row, gt_row in pd.read_csv(
fname_groundtruth, index_col=0
).iterrows()
]
for iter_row, gt_row in pd.read_csv(fname_groundtruth, index_col=0).iterrows():
true_variant = str(gt_row["position"]) + gt_row["variant"]
mutation_type = str(gt_row["type"])
if method.startswith(('lofreq', 'shorah', 'viloca')):
if method.startswith(("lofreq", "shorah", "viloca")):
is_false_negative = True
for variant in VCF(fname_vcf):
for idx_base, base in enumerate(variant.ALT):
Expand All @@ -59,14 +66,20 @@ def mutation_calls_details(vcf_list, groundtruth_list):
if true_variant == predicted_variant:
is_false_negative = False
# get pred_freq
if method.startswith('lofreq'):
if method.startswith("lofreq"):
posterior = 1.0
freq = variant.INFO.get('AF')#vi.split(',')[idx_base]
elif method.startswith(('shorah', 'viloca')):
freq = variant.INFO.get("AF") # vi.split(',')[idx_base]
elif method.startswith(("shorah", "viloca")):
if mutation_type != "insertion":
freq = (int(variant.INFO.get('Fvar')) + int(variant.INFO.get('Rvar'))) / (int(variant.INFO.get('Ftot')) + int(variant.INFO.get('Rtot')))
freq = (
int(variant.INFO.get("Fvar"))
+ int(variant.INFO.get("Rvar"))
) / (
int(variant.INFO.get("Ftot"))
+ int(variant.INFO.get("Rtot"))
)
elif mutation_type == "insertion":
freq = (float(variant.INFO.get('Freq1')))
freq = float(variant.INFO.get("Freq1"))

tmp.append(
{
Expand Down Expand Up @@ -125,6 +138,7 @@ def main(vcf_list, groundtruth_list, fname_out):
df = mutation_calls_details(vcf_list, groundtruth_list)
df.to_csv(fname_out)


if __name__ == "__main__":
main(
[Path(e) for e in snakemake.input.vcf_list],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
import pandas as pd


def runtime(benchmark_list, fname_out):
# gather benchmark information
tmp = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ def convert_vcf(fname):
variant_list.add(f"{zero_based_pos}{base}")
return variant_list


def convert_groundtruth(fname):
df = pd.read_csv(fname, index_col=0)
return set((df["position"].astype(str) + df["variant"]).tolist())


def mutation_calls_details(vcf_list, groundtruth_list):
# compute performance
tmp = []
Expand All @@ -48,7 +50,7 @@ def mutation_calls_details(vcf_list, groundtruth_list):
for iter_row, gt_row in pd.read_csv(fname_groundtruth, index_col=0).iterrows():
true_variant = str(gt_row["position"]) + gt_row["variant"]
mutation_type = str(gt_row["type"])
if method.startswith(('lofreq', 'shorah')):
if method.startswith(("lofreq", "shorah")):
is_false_negative = True
for variant in VCF(fname_vcf):
for idx_base, base in enumerate(variant.ALT):
Expand All @@ -58,12 +60,18 @@ def mutation_calls_details(vcf_list, groundtruth_list):
if true_variant == predicted_variant:
is_false_negative = False
# get pred_freq
if method.startswith('lofreq'):
if method.startswith("lofreq"):
posterior = 1.0
freq = variant.INFO.get('AF')#vi.split(',')[idx_base]
elif method.startswith('shorah'):
freq = variant.INFO.get("AF") # vi.split(',')[idx_base]
elif method.startswith("shorah"):
posterior = 1.0
freq = (int(variant.INFO.get('Fvar')) + int(variant.INFO.get('Rvar'))) / (int(variant.INFO.get('Ftot')) + int(variant.INFO.get('Rtot')))
freq = (
int(variant.INFO.get("Fvar"))
+ int(variant.INFO.get("Rvar"))
) / (
int(variant.INFO.get("Ftot"))
+ int(variant.INFO.get("Rtot"))
)

tmp.append(
{
Expand Down Expand Up @@ -99,23 +107,41 @@ def mutation_calls_details(vcf_list, groundtruth_list):
}
)

elif method.startswith('viloca'):

colnames=['Chromosome', 'Pos', 'Ref', 'Alt', 'Frq', 'Pst', 'Rvar', 'Fvar', 'Rtot', 'Ftot', 'Qval']
elif method.startswith("viloca"):

colnames = [
"Chromosome",
"Pos",
"Ref",
"Alt",
"Frq",
"Pst",
"Rvar",
"Fvar",
"Rtot",
"Ftot",
"Qval",
]
# we want the file SNVs_0.010000.tsv as the posterior filter was not applied here
fname_SNVs_correct = str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv"
df_predicted = pd.read_csv(fname_SNVs_correct, names=colnames, header=None ,sep="\t")
df_predicted['Pst'] = pd.to_numeric(df_predicted['Pst'], errors='coerce')
fname_SNVs_correct = (
str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv"
)
df_predicted = pd.read_csv(
fname_SNVs_correct, names=colnames, header=None, sep="\t"
)
df_predicted["Pst"] = pd.to_numeric(
df_predicted["Pst"], errors="coerce"
)

for iter_row_pred, pred_row in df_predicted.iterrows():
zero_based_pos = int(pred_row['Pos']) - 1 # VCF is 1-based
zero_based_pos = int(pred_row["Pos"]) - 1 # VCF is 1-based
is_false_negative = True
predicted_variant = f"{zero_based_pos}{pred_row['Alt']}"

if true_variant == predicted_variant:
is_false_negative = False
posterior = pred_row['Pst']
freq = pred_row['Frq']
posterior = pred_row["Pst"]
freq = pred_row["Frq"]

tmp.append(
{
Expand Down Expand Up @@ -158,6 +184,7 @@ def main(vcf_list, groundtruth_list, fname_out):
df = mutation_calls_details(vcf_list, groundtruth_list)
df.to_csv(fname_out)


if __name__ == "__main__":
main(
[Path(e) for e in snakemake.input.vcf_list],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@ def convert_vcf(df):
variant_list = set()

for idx_row, variant in df.iterrows():
for base in variant['Alt']:
zero_based_pos = int(variant['Pos']) - 1 # VCF is 1-based
for base in variant["Alt"]:
zero_based_pos = int(variant["Pos"]) - 1 # VCF is 1-based
variant_list.add(f"{zero_based_pos}{base}")
return variant_list


def convert_groundtruth(fname):
df = pd.read_csv(fname, index_col=0)
return set((df["position"].astype(str) + df["variant"]).tolist())
Expand Down Expand Up @@ -76,20 +77,38 @@ def compute_performance(true_variants, predicted_variants):

def performance_plots(vcf_list, groundtruth_list, posterior_threshold):
# compute performance
colnames=['Chromosome', 'Pos', 'Ref', 'Alt', 'Frq', 'Pst', 'Rvar', 'Fvar', 'Rtot', 'Ftot', 'Qval']
colnames = [
"Chromosome",
"Pos",
"Ref",
"Alt",
"Frq",
"Pst",
"Rvar",
"Fvar",
"Rtot",
"Ftot",
"Qval",
]
tmp = []
fps_tmp = []
for fname_vcf, fname_groundtruth in zip(vcf_list, groundtruth_list):

# we want the file SNVs_0.010000.tsv as the posterior filter was not applied here
fname_SNVs_correct = str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv"
fname_SNVs_correct = (
str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000.tsv"
)
if os.path.isfile(fname_SNVs_correct):
df_predicted = pd.read_csv(fname_SNVs_correct, names=colnames, header=None ,sep="\t")
df_predicted = pd.read_csv(
fname_SNVs_correct, names=colnames, header=None, sep="\t"
)
else:
fname_SNVs_correct = str(fname_vcf).split("snvs_.vcf")[0] + "work/snv/SNVs_0.010000_final.csv"
fname_SNVs_correct = (
str(fname_vcf).split("snvs_.vcf")[0]
+ "work/snv/SNVs_0.010000_final.csv"
)
df_predicted = pd.read_csv(fname_SNVs_correct)
df_predicted = df_predicted.rename(columns={'Var': 'Alt',
'Pst2': 'Pst'})
df_predicted = df_predicted.rename(columns={"Var": "Alt", "Pst2": "Pst"})

parts = str(fname_vcf).split("/")

Expand All @@ -98,11 +117,10 @@ def performance_plots(vcf_list, groundtruth_list, posterior_threshold):
elif len(parts) == 8: # for multi workflow
_, _, _, params, method, _, replicate, _ = parts


#filter dataframe according to posterior_threshold
df_predicted['Pst'] = pd.to_numeric(df_predicted['Pst'], errors='coerce')
df_predicted = df_predicted.dropna(subset=['Pst'])
df_predicted = df_predicted[df_predicted['Pst']>posterior_threshold]
# filter dataframe according to posterior_threshold
df_predicted["Pst"] = pd.to_numeric(df_predicted["Pst"], errors="coerce")
df_predicted = df_predicted.dropna(subset=["Pst"])
df_predicted = df_predicted[df_predicted["Pst"] > posterior_threshold]

true_variants = convert_groundtruth(fname_groundtruth)
predicted_variants = convert_vcf(df_predicted)
Expand Down Expand Up @@ -143,7 +161,6 @@ def performance_plots(vcf_list, groundtruth_list, posterior_threshold):
return df_perf



def main(vcf_list, groundtruth_list, fname_out):

posterior_thresholds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
Expand All @@ -152,15 +169,14 @@ def main(vcf_list, groundtruth_list, fname_out):
for posterior_threshold in posterior_thresholds:

df_pos_thres = performance_plots(
vcf_list,
groundtruth_list,
posterior_threshold
)
vcf_list, groundtruth_list, posterior_threshold
)

dfs_tmp.append(df_pos_thres)

pd.concat(dfs_tmp).to_csv(fname_out)


if __name__ == "__main__":
main(
[Path(e) for e in snakemake.input.vcf_list],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def main(
check=True,
)

runtime_limit = 15*24*60*60 - 60*60 # 15 days - 1h
runtime_limit = 15 * 24 * 60 * 60 - 60 * 60 # 15 days - 1h
cliquesnv_mode = None
if seq_type == "illumina":
cliquesnv_mode = "snv-illumina"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def main(
check=True,
)

runtime_limit = 15*24*60*60 - 60*60 # 15 days - 1h
runtime_limit = 15 * 24 * 60 * 60 - 60 * 60 # 15 days - 1h
cliquesnv_mode = None
if seq_type == "illumina":
cliquesnv_mode = "snv-illumina"
Expand Down
Loading

0 comments on commit 9ab7264

Please sign in to comment.