Skip to content

Commit

Permalink
Merge pull request #277 from bioinfo-chru-strasbourg/transcript_bubbl…
Browse files Browse the repository at this point in the history
…e_view

Add trancript mapping and filter, and manage version #256
  • Loading branch information
antonylebechec authored Sep 23, 2024
2 parents 7ee30f9 + 9471f5f commit d27578b
Show file tree
Hide file tree
Showing 4 changed files with 630 additions and 17 deletions.
24 changes: 15 additions & 9 deletions howard/functions/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -3665,25 +3665,28 @@ def get_random(N: int = 10) -> str:
return "".join(random.choices(string.ascii_uppercase + string.digits, k=N))


def transcripts_file_to_df(transcripts_file: str) -> pd.DataFrame:
def transcripts_file_to_df(transcripts_file: str, column_names: list = ["transcript", "gene"]) -> pd.DataFrame:
"""
This function reads a transcripts file into a pandas DataFrame, filtering out comment lines.
This Python function reads a transcripts file into a pandas DataFrame, filtering out comment lines.
:param transcripts_file: The `transcripts_file` parameter is a string that represents the file path
to a file containing transcript information. This function is designed to read the contents of this
file and convert it into a pandas DataFrame. The file is expected to be tab-separated with two
columns: "transcript" and "gene
:type transcripts_file: str
:return: The function `transcripts_file_to_df` returns a pandas DataFrame containing transcript and
gene information read from a specified file. The function processes the file by filtering out
comment lines and then returns the resulting DataFrame.
:param column_names: The `column_names` parameter is a list that specifies the column names expected
in the transcripts file. By default, it is set to `["transcript", "gene"]`, indicating that the file
should have two columns named "transcript" and "gene". If the actual column names in the
:type column_names: list
:return: A pandas DataFrame containing transcript and gene information read from the specified file
after filtering out comment lines is being returned.
"""

# Full path
transcripts_file = full_path(transcripts_file)

# Transcript dataframe
transcripts_dataframe = pd.DataFrame(columns=["transcript", "gene"])
transcripts_dataframe = pd.DataFrame(columns=column_names)

# If file exists
if transcripts_file and os.path.exists(transcripts_file):
Expand All @@ -3693,12 +3696,15 @@ def transcripts_file_to_df(transcripts_file: str) -> pd.DataFrame:
transcripts_file,
sep="\t",
header=None,
names=["transcript", "gene"],
names=column_names,
)

# First column
first_column = column_names[0]

# Filter comment on lines
transcripts_dataframe = transcripts_dataframe[
transcripts_dataframe["transcript"].str.contains("^[^#]+")
transcripts_dataframe[first_column].str.contains("^[^#]+")
]

# Return
Expand Down
155 changes: 147 additions & 8 deletions howard/objects/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10281,6 +10281,21 @@ def create_transcript_view(
# Struct
struct = param.get("transcripts", {}).get("struct", None)

# Transcript veresion
transcript_id_remove_version = param.get("transcripts", {}).get(
"transcript_id_remove_version", False
)

# Transcripts mapping
transcript_id_mapping_file = param.get("transcripts", {}).get(
"transcript_id_mapping_file", None
)

# Transcripts mapping
transcript_id_mapping_force = param.get("transcripts", {}).get(
"transcript_id_mapping_force", None
)

if struct:

# Transcripts table
Expand Down Expand Up @@ -10347,19 +10362,143 @@ def create_transcript_view(
UNION BY NAME SELECT * FROM {temporary_table}
"""

# transcript table tmp
transcript_table_tmp = "transcripts_tmp"
transcript_table_tmp2 = "transcripts_tmp2"
transcript_table_tmp3 = "transcripts_tmp3"

# Merge on transcript
query_merge_on_transcripts_annotation_fields = []

# Add transcript list
query_merge_on_transcripts_annotation_fields.append(
f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
)

# Aggregate all annotations fields
for annotation_field in set(annotation_fields):
query_merge_on_transcripts_annotation_fields.append(
f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
)
# Query for transcripts view
query_merge_on_transcripts = f"""
SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
FROM ({query_merge})
GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
"""

# Transcripts mapping
if transcript_id_mapping_file:

# Transcript dataframe
transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
transcript_id_mapping_dataframe = transcripts_file_to_df(
transcript_id_mapping_file, column_names=["transcript", "alias"]
)

# Transcript version remove
if transcript_id_remove_version:
query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
query_left_join = f"""
LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
"""
else:
query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
query_left_join = f"""
LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
"""

# Transcript column for group by merge
query_transcript_merge_group_by = """
CASE
WHEN transcript_mapped NOT IN ('')
THEN split_part(transcript_mapped, '.', 1)
ELSE split_part(transcript_original, '.', 1)
END
"""

# Merge query
transcripts_tmp2_query = f"""
SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
FROM ({query_merge}) AS {transcript_table_tmp}
{query_left_join}
GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
"""

# Retrive columns after mege
transcripts_tmp2_describe_query = f"""
DESCRIBE {transcripts_tmp2_query}
"""
transcripts_tmp2_describe_list = list(
self.get_query_to_df(query=transcripts_tmp2_describe_query)[
"column_name"
]
)

# Create list of columns for select clause
transcripts_tmp2_describe_select_clause = []
for field in transcripts_tmp2_describe_list:
if field not in [
"#CHROM",
"POS",
"REF",
"ALT",
"INFO",
"transcript_mapped",
]:
as_field = field
if field in ["transcript_original"]:
as_field = "transcripts_mapped"
transcripts_tmp2_describe_select_clause.append(
f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
)

# Merge with mapping
query_merge_on_transcripts = f"""
SELECT
"#CHROM", POS, REF, ALT, INFO,
CASE
WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
THEN ANY_VALUE(transcript_mapped)
ELSE ANY_VALUE(transcript_original)
END AS transcript,
{", ".join(transcripts_tmp2_describe_select_clause)}
FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
GROUP BY "#CHROM", POS, REF, ALT, INFO,
{query_transcript_merge_group_by}
"""

# Add transcript filter from mapping file
if transcript_id_mapping_force:
query_merge_on_transcripts = f"""
SELECT *
FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
"""

# No transcript mapping
else:

# Remove transcript version
if transcript_id_remove_version:
query_transcript_column = f"""
split_part({transcript_table_tmp}.transcript, '.', 1)
"""
else:
query_transcript_column = """
transcript
"""

# Query sections
query_transcript_column_select = (
f"{query_transcript_column} AS transcript"
)
query_transcript_column_group_by = query_transcript_column

# Query for transcripts view
query_merge_on_transcripts = f"""
SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
FROM ({query_merge}) AS {transcript_table_tmp}
GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
"""

log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

# Drop transcript view is necessary
if transcripts_table_drop:
Expand Down Expand Up @@ -10724,7 +10863,7 @@ def transcript_view_to_variants(
clause_to_format = []
for field in transcripts_infos_columns:
clause_select.append(
f""" regexp_split_to_table("{field}", ',') AS '{field}' """
f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
)
clause_to_json.append(f""" '{field}': "{field}" """)
clause_to_format.append(f""" "{field}" """)
Expand Down
8 changes: 8 additions & 0 deletions tests/data/transcripts.for_mapping.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
NR_024540
NR_036266
NM_001346900
NM_001346897
NR_047551
NM_001346941.2
NM_005228
NM_001005484 ENST00000641515.1
Loading

0 comments on commit d27578b

Please sign in to comment.