Skip to content

Commit

Permalink
Merge pull request #225 from bioinfo-chru-strasbourg/fix_from_annovar
Browse files Browse the repository at this point in the history
fix from annovar tool #224
  • Loading branch information
antonylebechec authored Apr 18, 2024
2 parents 90fc7a3 + 7e90ad5 commit d3b4ab5
Show file tree
Hide file tree
Showing 14 changed files with 567 additions and 35 deletions.
44 changes: 44 additions & 0 deletions howard/functions/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import sys
from tempfile import NamedTemporaryFile
import tempfile
import typing
import duckdb
import json
import argparse
Expand Down Expand Up @@ -3467,3 +3468,46 @@ def transcripts_file_to_df(transcripts_file: str) -> pd.DataFrame:

# Return
return transcripts_dataframe


def identical(
    vcf_list: typing.List[str], begin: str = "##", line_strip: bool = True
) -> bool:
    """
    Check whether several text files (typically VCF files) have the same content.

    Each file is read line by line. Lines starting with `begin` are ignored, and
    the remaining lines are compared, in order, across all files.

    :param vcf_list: Paths of the files to compare. An empty list or a single
        path yields `True`, as there is nothing to differ from.
    :type vcf_list: typing.List[str]
    :param begin: Prefix of the lines to leave out of the comparison. The
        prefix is tested on the raw line, before any stripping. An empty string
        disables the filtering, so every line is compared. Defaults to "##".
    :type begin: str (optional)
    :param line_strip: If `True`, leading and trailing whitespace (including
        the end-of-line character) is removed from each kept line with
        `str.strip` before the comparison. Defaults to True.
    :type line_strip: bool (optional)
    :return: `True` if all files hold the same kept lines in the same order,
        `False` otherwise.
    :rtype: bool
    :raises OSError: If one of the files cannot be opened. Every file is
        opened, even once a difference has already been found.
    """

    # Only the first file's lines are retained: "all files equal" is the same
    # as "every file equals the first one", so there is no need to keep the
    # content of all files in memory at the same time.
    reference = None
    all_identical = True

    for vcf in vcf_list:
        with open(vcf, "r") as vcf_file:
            lines = [
                line.strip() if line_strip else line
                for line in vcf_file
                # Every string starts with "", so an empty prefix must be
                # treated as "no filter" rather than "skip everything".
                if not begin or not line.startswith(begin)
            ]

        if reference is None:
            reference = lines
        elif lines != reference:
            # Keep looping instead of returning here, so that an unreadable
            # path later in the list is still reported to the caller.
            all_identical = False

    return all_identical
11 changes: 8 additions & 3 deletions howard/functions/from_annovar.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,13 +307,15 @@ def annovar_to_vcf(
else:

with open(input_file, "r") as f:

for line in f:
if str(line).startswith("##"):
continue
elif len(line.strip().split("\t")) < 5:
header_ok = False
elif str(line).startswith("#"):
log.debug("Found header")
header_ok = True
header = line.strip().split("\t")
if "#CHROM" not in header:
chrom_list_for_header = [
Expand Down Expand Up @@ -388,7 +390,10 @@ def annovar_to_vcf(
break
f.close()

if not header_ok:
if not header_ok and (
set(["#CHROM", "POS", "REF", "ALT"]).issubset(set(header))
or set(["#CHROM", "START", "END"]).issubset(set(header))
):
raise ValueError("Error in header")

# protect info tag from unauthorized characters
Expand All @@ -413,7 +418,6 @@ def annovar_to_vcf(

# Clean nb header and skip line
nb_skip_line = nb_header_line
nb_header_line -= 1
if nb_header_line <= 0:
nb_header_line = None

Expand All @@ -429,9 +433,10 @@ def annovar_to_vcf(
dtype[h] = column_type

# Check format VCF readable
# nrows = 100000
log.debug("Check input file struct...")
nrows_sampling = 1000000
log.debug(f"nrows_sampling={nrows_sampling}")
log.debug(f"nb_header_line={nb_header_line}")
try:
df = pd.read_csv(
input_file,
Expand Down
10 changes: 10 additions & 0 deletions tests/databases/others/hg19_dbnsfp42a.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#Chr Start End Ref Alt SIFT_score SIFT_converted_rankscore SIFT_pred SIFT4G_score SIFT4G_converted_rankscore SIFT4G_pred Polyphen2_HDIV_score Polyphen2_HDIV_rankscore Polyphen2_HDIV_pred Polyphen2_HVAR_score Polyphen2_HVAR_rankscore Polyphen2_HVAR_pred LRT_score LRT_converted_rankscore LRT_pred MutationTaster_score MutationTaster_converted_rankscore MutationTaster_pred MutationAssessor_score MutationAssessor_rankscore MutationAssessor_pred FATHMM_score FATHMM_converted_rankscore FATHMM_pred PROVEAN_score PROVEAN_converted_rankscore PROVEAN_pred VEST4_score VEST4_rankscore MetaSVM_score MetaSVM_rankscore MetaSVM_pred MetaLR_score MetaLR_rankscore MetaLR_pred MetaRNN_score MetaRNN_rankscore MetaRNN_pred M-CAP_score M-CAP_rankscore M-CAP_pred REVEL_score REVEL_rankscore MutPred_score MutPred_rankscore MVP_score MVP_rankscore MPC_score MPC_rankscore PrimateAI_score PrimateAI_rankscore PrimateAI_pred DEOGEN2_score DEOGEN2_rankscore DEOGEN2_pred BayesDel_addAF_score BayesDel_addAF_rankscore BayesDel_addAF_pred BayesDel_noAF_score BayesDel_noAF_rankscore BayesDel_noAF_pred ClinPred_score ClinPred_rankscore ClinPred_pred LIST-S2_score LIST-S2_rankscore LIST-S2_pred Aloft_pred Aloft_Confidence CADD_raw CADD_raw_rankscore CADD_phred DANN_score DANN_rankscore fathmm-MKL_coding_score fathmm-MKL_coding_rankscore fathmm-MKL_coding_pred fathmm-XF_coding_score fathmm-XF_coding_rankscore fathmm-XF_coding_pred Eigen-raw_coding Eigen-raw_coding_rankscore Eigen-PC-raw_coding Eigen-PC-raw_coding_rankscore GenoCanyon_score GenoCanyon_rankscore integrated_fitCons_score integrated_fitCons_rankscore integrated_confidence_value LINSIGHT LINSIGHT_rankscore GERP++_NR GERP++_RS GERP++_RS_rankscore phyloP100way_vertebrate phyloP100way_vertebrate_rankscore phyloP30way_mammalian phyloP30way_mammalian_rankscore phastCons100way_vertebrate phastCons100way_vertebrate_rankscore phastCons30way_mammalian phastCons30way_mammalian_rankscore SiPhy_29way_logOdds SiPhy_29way_logOdds_rankscore Interpro_domain 
GTEx_V8_gene GTEx_V8_tissue
1 65565 65565 A C . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.317 0.491 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.166 0.191 14.46 0.703 0.092 0.567 0.302 D . . . . . . . 0.000 0.029 0.061 0.011 0 0.207 0.308 2.73 2.73 0.313 2.589 0.458 1.152 0.705 0.993 0.379 1.000 0.863 4.611 0.118 . . .
1 65565 65565 A G . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.318 0.492 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.189 0.194 14.59 0.739 0.105 0.286 0.236 N . . . . . . . 0.000 0.029 0.061 0.011 0 0.207 0.308 2.73 2.73 0.313 2.589 0.458 1.152 0.705 0.993 0.379 1.000 0.863 4.611 0.118 . . .
1 65565 65565 A T . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.316 0.491 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.173 0.192 14.50 0.689 0.088 0.551 0.298 D . . . . . . . 0.000 0.029 0.061 0.011 0 0.207 0.308 2.73 2.73 0.313 2.589 0.458 1.152 0.705 0.993 0.379 1.000 0.863 4.611 0.118 . . .
1 65566 65566 T A . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.343 0.514 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.257 0.203 14.96 0.710 0.095 0.441 0.272 N . . . . . . . 0.000 0.058 0.061 0.011 0 0.692 0.427 2.73 2.73 0.313 2.158 0.420 1.002 0.370 1.000 0.716 1.000 0.863 9.273 0.368 . . .
1 65566 65566 T C . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.346 0.516 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.273 0.206 15.05 0.782 0.122 0.594 0.309 D . . . . . . . 0.000 0.058 0.061 0.011 0 0.692 0.427 2.73 2.73 0.313 2.158 0.420 1.002 0.370 1.000 0.716 1.000 0.863 9.273 0.368 . . .
1 65566 65566 T G . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.344 0.514 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.249 0.202 14.92 0.753 0.110 0.620 0.316 D . . . . . . . 0.000 0.058 0.061 0.011 0 0.692 0.427 2.73 2.73 0.313 2.158 0.420 1.002 0.370 1.000 0.716 1.000 0.863 9.273 0.368 . . .
1 65567 65567 G A . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.350 0.519 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.142 0.188 14.32 0.541 0.051 0.575 0.304 D . . . . . . . 0.000 0.029 0.061 0.011 0 0.207 0.308 2.73 2.73 0.313 2.639 0.462 1.034 0.512 1.000 0.716 0.999 0.704 7.790 0.283 . . .
1 65567 65567 G C . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.348 0.517 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.082 0.180 13.96 0.609 0.066 0.564 0.301 D . . . . . . . 0.000 0.029 0.061 0.011 0 0.207 0.308 2.73 2.73 0.313 2.639 0.462 1.034 0.512 1.000 0.716 0.999 0.704 7.790 0.283 . . .
1 65567 65567 G T . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 0.347 0.516 T . . . . . . . . . . . . . . . . . . . . . . . . . . 0.9 0.651 D .; .; 1.092 0.181 14.03 0.540 0.051 0.462 0.277 N . . . . . . . 0.000 0.029 0.061 0.011 0 0.207 0.308 2.73 2.73 0.313 2.639 0.462 1.034 0.512 1.000 0.716 0.999 0.704 7.790 0.283 . . .
Loading

0 comments on commit d3b4ab5

Please sign in to comment.