Skip to content

Commit

Permalink
New options for transcripts prioritization #256
Browse files Browse the repository at this point in the history
  • Loading branch information
antonylebechec committed Sep 19, 2024
1 parent dea4173 commit 8d26b9d
Show file tree
Hide file tree
Showing 4 changed files with 275 additions and 100 deletions.
156 changes: 98 additions & 58 deletions howard/objects/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -1091,7 +1091,6 @@ def get_header(self, type: str = "vcf"):
elif type == "list":
return vcf_required


def get_header_infos_list(self) -> list:
"""
This function retrieves a list of information fields from the header.
Expand All @@ -1106,7 +1105,6 @@ def get_header_infos_list(self) -> list:

return infos_list


def get_header_length(self, file: str = None) -> int:
"""
The function `get_header_length` returns the length of the header list, excluding the #CHROM
Expand Down Expand Up @@ -9750,29 +9748,10 @@ def transcripts_prioritization(
table=transcripts_table,
pz_param=param.get("transcripts", {}).get("prioritization", {}),
)

# # DEVEL
# query_devel = f"""
# SELECT * FROM transcripts LIMIT 1
# """
# res_devel = self.get_query_to_df(query=query_devel)
# log.debug(f""" res_devel transcripts ={res_devel.to_string()} """)
# query_devel = f"""
# SELECT INFO FROM variants LIMIT 1
# """
# res_devel = self.get_query_to_df(query=query_devel)
# log.debug(f""" res_devel variants ={res_devel.to_string()} """)

if not prioritization_result:
log.warning("Transcripts prioritization not processed")
return False

# # Explode PZ fields
# self.explode_infos(
# table=transcripts_table,
# fields=list(pz_param_pzfields.keys()) + pz_mandatory_fields,
# )

# PZ fields sql query
query_update_select_list = []
query_update_concat_list = []
Expand All @@ -9795,70 +9774,131 @@ def transcripts_prioritization(

# Order by
pz_orders = (
param.get("transcripts", {}).get("prioritization", {}).get("pzorder", {})
param.get("transcripts", {})
.get("prioritization", {})
.get("prioritization_transcripts_order", {})
)
if not pz_orders:
pz_orders = {
pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
"transcript": "ASC",
}
for pz_order in pz_orders:
query_update_order_list.append(
f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
)

# Fields to explode
fields_to_explode = list(pz_param_pzfields.keys()) + pz_mandatory_fields + list(pz_orders.keys())
fields_to_explode.remove("transcript")
fields_to_explode = (
list(pz_param_pzfields.keys())
+ pz_mandatory_fields
+ list(pz_orders.keys())
)
# Remove transcript column as a specific transcript column
if "transcript" in fields_to_explode:
fields_to_explode.remove("transcript")

# Check fields to explode
for field_to_explode in fields_to_explode:
if field_to_explode not in self.get_header_infos_list():
msg_err = f"INFO/{field_to_explode} NOT IN header"
log.error(msg_err)
raise ValueError(msg_err)
# eExplode fields to explode
exploded_fields = self.explode_infos(

# Explode fields to explode
self.explode_infos(
table=transcripts_table,
fields=fields_to_explode,
)

# # DEVEL
# query_devel = f"""
# DESCRIBE

# {transcripts_table}
# """
# log.debug(f""" query_devel={query_devel} """)
# df_devel = self.get_query_to_df(query=query_devel)
# log.debug(df_devel)
# query_devel = f"""
# SELECT
# "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
# ROW_NUMBER() OVER (
# PARTITION BY "#CHROM", POS, REF, ALT
# ORDER BY {" , ".join(query_update_order_list)}
# ) AS rn
# FROM
# {transcripts_table}
# """
# log.debug(f""" query_devel={query_devel} """)
# df_devel = self.get_query_to_df(query=query_devel)
# log.debug(df_devel)

# Export Transcripts prioritization infos to variants table
query_update = f"""
WITH RankedTranscripts AS (
# Transcript preference file
transcripts_preference_file = (
param.get("transcripts", {})
.get("prioritization", {})
.get("prioritization_transcripts", {})
)
transcripts_preference_file = full_path(transcripts_preference_file)

# Transcript preference forced
transcript_preference_force = (
param.get("transcripts", {})
.get("prioritization", {})
.get("prioritization_transcripts_force", False)
)
# Transcript version forced
transcript_version_force = (
param.get("transcripts", {})
.get("prioritization", {})
.get("prioritization_transcripts_version_force", False)
)

# Transcripts Ranking
if transcripts_preference_file:

# Transcripts file to dataframe
if os.path.exists(transcripts_preference_file):
transcripts_preference_dataframe = transcripts_file_to_df(
transcripts_preference_file
)
else:
log.error(
f"Transcript file '{transcripts_preference_file}' does NOT exist"
)
raise ValueError(
f"Transcript file '{transcripts_preference_file}' does NOT exist"
)

# Order by depending to transcript preference forcing
if transcript_preference_force:
order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
else:
order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

# Transcript columns joined depend on version consideration
if transcript_version_force:
transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
else:
transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

# Query ranking for update
query_update_ranking = f"""
SELECT
"#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
ROW_NUMBER() OVER (
PARTITION BY "#CHROM", POS, REF, ALT
ORDER BY {order_by}
) AS rn
FROM {transcripts_table}
LEFT JOIN
(
SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
FROM transcripts_preference_dataframe
) AS transcripts_preference
ON {transcripts_version_join}
"""

else:

# Query ranking for update
query_update_ranking = f"""
SELECT
"#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
ROW_NUMBER() OVER (
PARTITION BY "#CHROM", POS, REF, ALT
ORDER BY {" , ".join(query_update_order_list)}
) AS rn
FROM
{transcripts_table}
FROM {transcripts_table}
"""

# DEBUG
# log.debug(f""" query_update_ranking={query_update_ranking} """)
# df_devel = self.get_query_to_df(query=query_update_ranking)
# log.debug(df_devel)

# Export Transcripts prioritization infos to variants table
query_update = f"""
WITH RankedTranscripts AS (
{query_update_ranking}
)
UPDATE {table_variants}
SET
Expand All @@ -9876,9 +9916,9 @@ def transcripts_prioritization(
AND variants."#CHROM" = RankedTranscripts."#CHROM"
AND variants."POS" = RankedTranscripts."POS"
AND variants."REF" = RankedTranscripts."REF"
AND variants."ALT" = RankedTranscripts."ALT"
AND variants."ALT" = RankedTranscripts."ALT"
"""

# log.debug(f"query_update={query_update}")
self.execute_query(query=query_update)

Expand Down
54 changes: 54 additions & 0 deletions tests/data/prioritization_transcripts_profiles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"transcripts": {
"LIST_S2_score": [
{
"type": "gt",
"value": "0.75",
"score": 10,
"flag": "PASS",
"comment": ["Very Good LIST Score"]
},
{
"type": "gt",
"value": "0.50",
"score": 10,
"flag": "PASS",
"comment": ["Good LIST Score"]
}
],
"CLNSIG": [
{
"type": "eq",
"value": "pathogenic",
"score": 100,
"flag": "PASS",
"comment": ["Pathogenic"]
}
],
"AnnotationImpact": [
{
"type": "eq",
"value": "MODIFIER",
"score": 100,
"flag": "PASS",
"comment": ["MODIFIER"]
}
],
"transcript": [
{
"type": "eq",
"value": "NM_005228.5",
"score": 100,
"flag": "PASS",
"comment": ["NM_005228.5"]
},
{
"type": "eq",
"value": "NM_001346941.2",
"score": 100,
"flag": "PASS",
"comment": ["NM_001346941.2"]
}
]
}
}
6 changes: 6 additions & 0 deletions tests/data/transcripts.for_prioritization.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
NR_024540 WASH7P
NR_036266 MIR1302-9
NM_001346900 EGFR
NM_001346897 EGFR
NR_047551 EGFR
NM_001346941.2 EGFR
Loading

0 comments on commit 8d26b9d

Please sign in to comment.