Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vcf fix2 #481

Merged
merged 5 commits into from
Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- fix bug when using parameter "whitelist" [#466](https://github.com/nf-core/rnafusion/pull/466)
- fix VCF_COLLECT handling when a tool is absent from FUSIONREPORT report [#458](https://github.com/nf-core/rnafusion/pull/458)
- fix VCF_COLLECT when fusioninspector output is empty but fusionreport is not [#465](https://github.com/nf-core/rnafusion/pull/465)
- fix VCF_COLLECT bug [#481](https://github.com/nf-core/rnafusion/pull/481)

### Removed

Expand Down
178 changes: 131 additions & 47 deletions bin/vcf_collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,42 +47,61 @@ def vcf_collect(
df_not_symbol = merged_df[merged_df["Left_ensembl_gene_id"].notna()]

df_not_symbol = hgnc_df.merge(
df_not_symbol, how="right", left_on="ensembl_gene_id", right_on="Left_ensembl_gene_id"
df_not_symbol,
how="right",
left_on="ensembl_gene_id",
right_on="Left_ensembl_gene_id",
)
df_symbol = hgnc_df.merge(
df_symbol, how="right", left_on="symbol", right_on="GeneA"
)
df_symbol = hgnc_df.merge(df_symbol, how="right", left_on="symbol", right_on="GeneA")
df = pd.concat([df_not_symbol, df_symbol])
df = df.rename(columns={"hgnc_id": "Left_hgnc_id"})

df_symbol = df[df["Right_ensembl_gene_id"].isna()]
df_not_symbol = df[df["Right_ensembl_gene_id"].notna()]

df_not_symbol = hgnc_df.merge(
df_not_symbol, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id"
df_not_symbol,
how="right",
left_on="ensembl_gene_id",
right_on="Right_ensembl_gene_id",
)
df_symbol = hgnc_df.merge(
df_symbol, how="right", left_on="symbol", right_on="GeneB"
)
df_symbol = hgnc_df.merge(df_symbol, how="right", left_on="symbol", right_on="GeneB")
df = pd.concat([df_not_symbol, df_symbol])
df = df.rename(columns={"hgnc_id": "Right_hgnc_id"})

gtf_df = build_gtf_dataframe(gtf)
all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id")
all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
all_df = df.merge(
gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id"
)
all_df[["PosA", "orig_start", "orig_end"]] = (
all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
)

all_df = all_df[
((all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"]))
(
(all_df["PosA"] >= all_df["orig_start"])
& (all_df["PosA"] <= all_df["orig_end"])
)
| ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
]

all_df.replace("", np.nan, inplace=True)
all_df = all_df.drop_duplicates()

all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan)
all_df[["exon_number", "transcript_version"]] = all_df[
["exon_number", "transcript_version"]
].replace(0, np.nan)
# Fill non-empty values within each group for 'exon_number' and 'transcript_version'
all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill")
)
all_df["transcript_version"] = all_df.groupby("PosA")["transcript_version"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill")
)
all_df["transcript_version"] = all_df.groupby("PosA")[
"transcript_version"
].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill"))

all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"})
all_df = all_df.rename(columns={"exon_number": "Left_exon_number"})
Expand Down Expand Up @@ -115,25 +134,36 @@ def vcf_collect(
]
].drop_duplicates()
all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype("str")
all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id")
all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0)
all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int)
all_df = all_df.merge(
gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id"
)
all_df[["PosB", "orig_start", "orig_end"]] = all_df[
["PosB", "orig_start", "orig_end"]
].fillna(0)
all_df[["PosB", "orig_start", "orig_end"]] = all_df[
["PosB", "orig_start", "orig_end"]
].astype(int)
all_df = all_df[
((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"]))
(
(all_df["PosB"] >= all_df["orig_start"])
& (all_df["PosB"] <= all_df["orig_end"])
)
| ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
]

all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan)
all_df = all_df.replace("", np.nan)

all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan)
all_df[["exon_number", "transcript_version"]] = all_df[
["exon_number", "transcript_version"]
].replace(0, np.nan)
# Fill non-empty values within each group for 'exon_number' and 'transcript_version'
all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill")
)
all_df["transcript_version"] = all_df.groupby("PosB")["transcript_version"].transform(
lambda x: x.fillna(method="ffill").fillna(method="bfill")
)
all_df["transcript_version"] = all_df.groupby("PosB")[
"transcript_version"
].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill"))

all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"})
all_df = all_df.rename(columns={"exon_number": "Right_exon_number"})
Expand Down Expand Up @@ -212,7 +242,9 @@ def parse_args(argv=None):
type=Path,
help="HGNC database.",
)
parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample")
parser.add_argument(
"--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample"
)
parser.add_argument(
"--out",
metavar="OUT",
Expand Down Expand Up @@ -273,14 +305,28 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
df = pd.read_csv(file, sep="\t")
df = df.rename(columns={"#FusionName": "FUSION"})
if not (df.empty):
df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True)
df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True)
df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True)
df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True)
df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(
":", expand=True
)
df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(
":", expand=True
)
df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split(
"^", expand=True
)
df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split(
"^", expand=True
)
df["annots"] = (
df["annots"]
.apply(convert_to_list)
.apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "")
.apply(
lambda x: (
",".join(map(str, x))
if isinstance(x, list)
else str(x) if pd.notna(x) else ""
)
)
)
else:
for i in [
Expand All @@ -304,7 +350,9 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
return df.set_index(["FUSION"])


def replace_value_with_column_name(row: pd.Series, value_to_replace: str, column_name: str) -> str:
def replace_value_with_column_name(
row: pd.Series, value_to_replace: str, column_name: str
) -> str:
"""
Replace a specific value in a row with the corresponding column name.
"""
Expand Down Expand Up @@ -334,9 +382,12 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
Make all column headers uppercase.
"""
with open(fusionreport_file) as f:
from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line]
expression = ast.literal_eval(from_html[0].split('], "tool')[0])
fusion_report = pd.DataFrame.from_dict({k: [v] for k, v in expression.items()})
from_html = [
line.split('rows": ')[1] for line in f if 'name="fusion_list' in line
]
tmp = str(from_html)[2:]
tmp2 = tmp.split(', "tools": ')[0]
fusion_report = pd.DataFrame(ast.literal_eval(tmp2))
if not "arriba" in fusion_report.columns:
fusion_report["arriba"] = ""
if not "fusioncatcher" in fusion_report.columns:
Expand All @@ -352,25 +403,31 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
fusion_report["starfusion"] = fusion_report[["starfusion"]].apply(
replace_value_with_column_name, args=("true", "starfusion"), axis=1
)
fusion_report["FOUND_IN"] = fusion_report[["arriba", "starfusion", "fusioncatcher"]].apply(
concatenate_columns, axis=1
)
fusion_report["FOUND_IN"] = fusion_report[
["arriba", "starfusion", "fusioncatcher"]
].apply(concatenate_columns, axis=1)
fusion_report.columns = fusion_report.columns.str.upper()
fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x))
fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)

return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(
["FUSION"]
fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(
lambda x: ",".join(x) if len(x) > 0 else ""
)
fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split(
"--", expand=True
)

return fusion_report[
["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]
].set_index(["FUSION"])


def read_fusionreport_csv(file: str) -> pd.DataFrame:
df = pd.read_csv(file)
columns_to_iterate = ["starfusion", "arriba", "fusioncatcher"]
for column in columns_to_iterate:
if column not in df.columns:
df[column] = ""
df[["starfusion", "arriba", "fusioncatcher"]] = df[["starfusion", "arriba", "fusioncatcher"]].astype("str")
df[["starfusion", "arriba", "fusioncatcher"]] = df[
["starfusion", "arriba", "fusioncatcher"]
].astype("str")
for index, row in df.iterrows():
for column in columns_to_iterate:
cell_value = row[column]
Expand Down Expand Up @@ -398,7 +455,18 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame:
df[["GeneA", "GeneB"]] = df["Fusion"].str.split("--", expand=True)
df = df.set_index("Fusion")
df.to_csv("tmp.csv")
return df[["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]]
return df[
[
"GeneA",
"GeneB",
"ChromosomeA",
"PosA",
"StrandA",
"ChromosomeB",
"PosB",
"StrandB",
]
]


def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -424,8 +492,12 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
df["Right_hgnc_id"] = df["Right_hgnc_id"].fillna(0).astype(int).astype(str)
df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str)
df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str)
df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str)
df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str)
df["Left_transcript_version"] = (
df["Left_transcript_version"].fillna(0).astype(int).astype(str)
)
df["Right_transcript_version"] = (
df["Right_transcript_version"].fillna(0).astype(int).astype(str)
)
df["PosA"] = df["PosA"].fillna(0).astype(int).astype(str)
df["PosB"] = df["PosB"].fillna(0).astype(int).astype(str)
df["PROT_FUSION_TYPE"] = df["PROT_FUSION_TYPE"].replace(".", "nan")
Expand All @@ -452,7 +524,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};"
f"ANNOTATIONS={row['annots']}"
)
df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
df.loc[index, "Sample"] = (
f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
)

return df

Expand All @@ -474,7 +548,9 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None:
"FORMAT",
"Sample",
]
].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE)
].to_csv(
path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE
)

with open(out_file, "r+") as f:
content = f.read()
Expand All @@ -496,9 +572,15 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame:
Build a DataFrame from GTF file converted in TSV, extracting relevant columns.
"""
df = pd.read_csv(file, sep="\t")
df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True)
df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True)
return df[["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]]
df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split(
"^", expand=True
)
df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df[
"orig_coord_info"
].str.split(",", expand=True)
return df[
["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]
]


def main(argv=None):
Expand All @@ -511,7 +593,9 @@ def main(argv=None):
or not args.fusionreport_csv
or not args.hgnc
):
logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!")
logger.error(
f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!"
)
sys.exit(2)
vcf_collect(
args.fusioninspector,
Expand Down
Loading