Skip to content

Commit

Permalink
Add transcript export file #256
Browse files Browse the repository at this point in the history
  • Loading branch information
antonylebechec committed Sep 24, 2024
1 parent d27578b commit 9d6046e
Show file tree
Hide file tree
Showing 5 changed files with 280 additions and 3 deletions.
6 changes: 5 additions & 1 deletion config/param.transcripts.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"from_column_format": [
{
"transcripts_column": "ANN",
"transcripts_infos_column": "Feature_ID"
"transcripts_infos_column": "Feature_ID",
"column_clean": true
}
],
"from_columns_map": [
Expand Down Expand Up @@ -35,6 +36,9 @@
"prioritization_config": "config/prioritization_transcripts_profiles.json",
"pzprefix": "PZT",
"prioritization_score_mode": "HOWARD"
},
"export": {
"output": "/tmp/output.transcripts.tsv"
}
}
}
134 changes: 134 additions & 0 deletions howard/objects/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2273,6 +2273,7 @@ def export_output(
query=query,
export_header=export_header,
sample_list=sample_list,
table="transcripts",
)

# Remove
Expand Down Expand Up @@ -6622,6 +6623,14 @@ def get_config_default(self, name: str) -> dict:
"function_name": "calculation_transcripts_prioritization",
"function_params": [],
},
"transcripts_export": {
"type": "python",
"name": "transcripts_export",
"description": "Export transcripts table/view as a file (using param.json)",
"available": True,
"function_name": "calculation_transcripts_export",
"function_params": [],
},
},
"prioritizations": {
"default": {
Expand Down Expand Up @@ -9612,10 +9621,135 @@ def calculation_transcripts_prioritization(self) -> None:
else:
log.info("No Transcripts to process. Check param.json file configuration")

def calculation_transcripts_export(self) -> None:
    """
    Build the transcripts table/view and export it.

    The table name is obtained from ``self.create_transcript_view()``. When a
    name is returned it is handed to ``self.transcripts_export()``; otherwise
    an informational message is logged and nothing is exported.

    :return: None
    """

    # Create transcripts table
    transcripts_view = self.create_transcript_view()

    # Without a transcripts table/view there is nothing to export
    if not transcripts_view:
        log.info("No Transcripts to process. Check param.json file configuration")
        return

    # Export transcripts table
    self.transcripts_export(transcripts_table=transcripts_view)


###############
# Transcripts #
###############

def transcripts_export(
    self, transcripts_table: str = None, param: dict = {}
) -> bool:
    """
    Export the transcripts table/view into the file defined in parameters.

    The output file is read from ``param["transcripts"]["export"]["output"]``.
    A temporary copy of the transcripts table is created with an extra
    ``INFO`` column, and the export itself is delegated to
    ``self.export_output()``. For a "vcf" output format (as reported by
    ``get_file_format``), each annotation column is declared in the header
    (if missing) and serialised into ``INFO`` as ``field=value;``. For any
    other format the annotation columns are exported as plain columns.

    :param transcripts_table: name of the transcripts table/view to export
    :param param: parameters dict; when empty, ``self.get_param()`` is used
        (the default dict is only read, never mutated)
    :return: True when the export has been performed, False when no export
        output is defined in parameters
    """

    log.debug("Start transcripts export...")

    # Param
    if not param:
        param = self.get_param()

    # Param export
    param_transcript_export = param.get("transcripts", {}).get("export", {})

    # Output file
    transcripts_export_output = param_transcript_export.get("output", None)

    if not param_transcript_export or not transcripts_export_output:
        log.warning("No transcripts export parameters defined!")
        return False

    # Output file format (resolved before any table is created)
    transcripts_export_output_format = get_file_format(
        filename=transcripts_export_output
    )

    # List of transcripts annotations
    query_describe = f"""
        SELECT column_name
        FROM (
            DESCRIBE SELECT * FROM {transcripts_table}
        )
        WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO')
    """
    transcripts_annotations_list = list(
        self.get_query_to_df(query=query_describe)["column_name"]
    )

    # Annotation columns as quoted SQL identifiers, so that names with
    # special characters stay valid (embedded double quotes are doubled)
    transcripts_annotations_columns = [
        '"' + str(field).replace('"', '""') + '"'
        for field in transcripts_annotations_list
    ]

    # Create transcripts table for export
    transcripts_table_export = f"{transcripts_table}_export_" + "".join(
        random.choices(string.ascii_uppercase + string.digits, k=10)
    )
    # Column lists are joined from Python lists to avoid a dangling comma
    # when there is no annotation column
    columns_create = ", ".join(
        ['"#CHROM"', '"POS"', '"REF"', '"ALT"', "'' AS 'INFO'"]
        + transcripts_annotations_columns
    )
    query_create_transcripts_table_export = f"""
        CREATE TABLE {transcripts_table_export} AS (SELECT {columns_create} FROM {transcripts_table})
    """
    self.execute_query(query=query_create_transcripts_table_export)

    # From here on the temporary table exists: always drop it, even if the
    # update or the export fails
    try:

        # Format VCF - construct INFO
        if transcripts_export_output_format in ["vcf"]:

            # Construct query update INFO and header
            query_update_info = []
            for field, column in zip(
                transcripts_annotations_list, transcripts_annotations_columns
            ):

                # If field not in header
                if field not in self.get_header_infos_list():

                    # Add annotation field in header
                    self.get_header().infos[field] = vcf.parser._Info(
                        field,
                        ".",
                        "String",
                        f"Annotation '{field}' from transcript view",
                        "unknown",
                        "unknown",
                        0,
                    )

                # Field name as SQL string literal (single quotes doubled)
                field_literal = str(field).replace("'", "''")

                # Add field as INFO/tag
                query_update_info.append(
                    f"""
                        CASE
                            WHEN {column} IS NOT NULL
                            THEN concat('{field_literal}=', {column}, ';')
                            ELSE ''
                        END
                    """
                )

            # Query param
            query_update_info_value = (
                f""" concat({", ".join(["''"] + query_update_info)}) """
            )
            query_export_columns = """ "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """

        else:

            # Query param
            query_update_info_value = """ NULL """
            query_export_columns = ", ".join(
                ['"#CHROM"', '"POS"', '"REF"', '"ALT"']
                + transcripts_annotations_columns
            )

        # Update query INFO column
        query_update = f"""
            UPDATE {transcripts_table_export}
            SET INFO = {query_update_info_value}
        """
        self.execute_query(query=query_update)

        # Export
        self.export_output(
            output_file=transcripts_export_output,
            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
        )

    finally:

        # Drop transcripts export table
        query_drop_transcripts_table_export = f"""
            DROP TABLE {transcripts_table_export}
        """
        self.execute_query(query=query_drop_transcripts_table_export)

    return True


def transcripts_prioritization(
self, transcripts_table: str = None, param: dict = {}
) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion howard/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1893,7 +1893,7 @@ def __call__(self, string):
""" howard calculation --input=tests/data/example.ann.vcf.gz --output=/tmp/example.calculated.tsv --calculations='snpeff_hgvs,NOMEN' --hgvs_field=snpeff_hgvs --transcripts=tests/data/transcripts.tsv \n"""
""" howard calculation --input=tests/data/example.vcf.gz --output=/tmp/example.calculated.tsv --calculations='TRIO' --trio_pedigree='sample1,sample2,sample4' \n"""
""" howard calculation --input=tests/data/example.vcf.gz --output=/tmp/example.calculated.tsv --calculations='BARCODEFAMILY' --family_pedigree='sample1,sample2,sample4' \n"""
""" howard calculation --input=tests/data/example.ann.transcripts.vcf.gz --output=/tmp/example.calculation.transcripts.tsv --param=config/param.transcripts.json --calculations='TRANSCRIPTS_ANNOTATIONS,TRANSCRIPTS_PRIORITIZATION' \n"""
""" howard calculation --input=tests/data/example.ann.transcripts.vcf.gz --output=/tmp/example.calculation.transcripts.tsv --param=config/param.transcripts.json --calculations='TRANSCRIPTS_ANNOTATIONS,TRANSCRIPTS_PRIORITIZATION,TRANSCRIPTS_EXPORT' \n"""
""" howard calculation --input=tests/data/example.ann.vcf.gz --output=/tmp/example.ann.tsv --param=config/param.json \n"""
""" howard calculation --show_calculations \n"""
""" \n""",
Expand Down
44 changes: 44 additions & 0 deletions tests/data/param.transcripts.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"transcripts": {
"table": "transcripts",
"transcripts_info_field_json": "transcripts_json",
"transcripts_info_field_format": "transcripts_ann",
"struct": {
"from_column_format": [
{
"transcripts_column": "ANN",
"transcripts_infos_column": "Feature_ID",
"column_clean": true
}
],
"from_columns_map": [
{
"transcripts_column": "Ensembl_transcriptid",
"transcripts_infos_columns": [
"genename",
"Ensembl_geneid",
"LIST_S2_score",
"LIST_S2_pred"
]
},
{
"transcripts_column": "Ensembl_transcriptid",
"transcripts_infos_columns": [
"genename",
"VARITY_R_score",
"Aloft_pred"
]
}
]
},
"prioritization": {
"profiles": ["transcripts"],
"prioritization_config": "config/prioritization_transcripts_profiles.json",
"pzprefix": "PZT",
"prioritization_score_mode": "HOWARD"
},
"export": {
"output": "/tmp/output.transcripts.tsv"
}
}
}
97 changes: 96 additions & 1 deletion tests/test_variants_transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
from tempfile import TemporaryDirectory
import pytest
import vcf
import os

from howard.functions.commons import remove_if_exists
from howard.functions.commons import remove_if_exists, get_file_format
from howard.objects.variants import Variants
from test_needed import tests_folder, tests_config, tests_data_folder

Expand Down Expand Up @@ -1710,3 +1711,97 @@ def test_transcripts_create_view_param_mapping(
vcf.Reader(filename=output_vcf)
except:
assert False



@pytest.mark.parametrize(
    "output",
    [
        "output.vcf",
        "output.vcf.gz",
        "output.tsv",
        "output.tsv.gz",
        "output.parquet",
        "output.json",
    ],
)
def test_transcripts_create_view_export(output):
    """
    Check that the transcripts view is exported for each parametrized output.

    The test loads the example annotated VCF, creates the transcripts view,
    runs ``transcripts_export`` towards a file inside a per-test temporary
    directory, then checks that the file exists and, for a "vcf" format,
    that it can be opened with ``vcf.Reader``.

    :param output: name of the output file (its extension selects the format)
    """

    # Outputs are written in a dedicated temporary directory, removed at the
    # end of the test, so a stale file from a previous run can never satisfy
    # the existence check below
    with TemporaryDirectory(dir=tests_folder) as tmp_dir:

        # Init files
        input_vcf = f"{tests_data_folder}/example.ann.transcripts.vcf.gz"
        output_file = f"{tmp_dir}/{output}"

        # Construct param dict
        param_struct = {
            "table": "transcripts",
            "column_id": "transcript",
            "transcripts_info_json": "transcripts_json",
            "transcripts_info_field": "transcripts_json",
            "transcript_id_remove_version": True,
            "transcript_id_mapping_file": None,
            "transcript_id_mapping_force": False,
            "struct": {
                "from_column_format": [
                    {
                        "transcripts_column": "ANN",
                        "transcripts_infos_column": "Feature_ID",
                        "column_clean": True,
                    },
                ],
                "from_columns_map": [
                    {
                        "transcripts_column": "Ensembl_transcriptid",
                        "transcripts_infos_columns": [
                            "genename",
                            "Ensembl_geneid",
                            "LIST_S2_score",
                            "LIST_S2_pred",
                        ],
                        "column_clean": False,
                    },
                    {
                        "transcripts_column": "Ensembl_transcriptid",
                        "transcripts_infos_columns": [
                            "genename",
                            "VARITY_R_score",
                            "Aloft_pred",
                        ],
                        "column_clean": False,
                    },
                ],
            },
            "export": {"output": output_file},
        }

        # Param without prioritization
        param_with_transcripts = {"transcripts": dict(param_struct)}

        # Create object
        variants = Variants(
            conn=None, input=input_vcf, param=param_with_transcripts, load=True
        )

        # Create transcript view
        transcripts_table = variants.create_transcript_view(
            param=param_with_transcripts
        )

        # Export: any exception propagates and fails the test with its real
        # traceback; a False return means the export was skipped
        exported = variants.transcripts_export(
            transcripts_table=transcripts_table, param=param_with_transcripts
        )
        assert exported is not False

        # Output file must have been created
        assert os.path.isfile(output_file)

        # VCF output must be readable (a parsing error fails the test)
        if get_file_format(output) in ["vcf"]:
            vcf.Reader(filename=output_file)

0 comments on commit 9d6046e

Please sign in to comment.