Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cz/fix issue#32 #38

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cluster_transform/code/py_code/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,9 @@ def get_cluster_pyclone_vi(cluster_file, tsv_files, alpha):
# We need ast.literal_eval(x).decode("utf-8") since the csv file output by pyclone-vi
# contains literal "b'xxx_id'" in the csv file, we only want the string "xxx_id" inside.
# May change this to a better way in the future.
df_clusters["mutation_id"] = df_clusters["mutation_id"].apply(lambda x: ast.literal_eval(x).decode("utf-8"))
df_clusters["sample_id"] = df_clusters["sample_id"].apply(lambda x: ast.literal_eval(x).decode("utf-8"))
if df_clusters["mutation_id"][0].startswith("b'"):
df_clusters["mutation_id"] = df_clusters["mutation_id"].apply(lambda x: ast.literal_eval(x).decode("utf-8"))
df_clusters["sample_id"] = df_clusters["sample_id"].apply(lambda x: ast.literal_eval(x).decode("utf-8"))
df_input = pd.read_csv(tsv_files, sep='\t').set_index("mutation_id")
df_input["VAF"] = df_input["alt_counts"] / (df_input["ref_counts"] + df_input["alt_counts"])
list_clustered = []
Expand Down
4 changes: 2 additions & 2 deletions physigs/code/physigs_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
echo "physigs_entrypoint.sh: begin."
echo "physigs_entrypoint.sh: list of input args from wdl runtime: $@"
if [ $# -eq 3 ]; then
Rscript run_physigs.R $1 $2 $3
Rscript run_physigs.R --tree $1 --snv $2 --outprefix $3
elif [ $# -eq 4 ]; then
Rscript run_physigs.R $1 $2 $3 $4
Rscript run_physigs.R --tree $1 --snv $2 --outprefix $3 --signature $4
else
echo "physigs_entrypoint.sh: wrong number of input args, only 3 or 4 allowed."
fi
Expand Down
107 changes: 61 additions & 46 deletions physigs/code/run_physigs.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,53 +2,64 @@ library(PhySigs, warn.conflicts = FALSE)
library(graph, warn.conflicts = FALSE)
library(Rgraphviz, warn.conflicts = FALSE)
library(RColorBrewer, warn.conflicts = FALSE)

args <- commandArgs(trailingOnly = TRUE)
if (length(args) == 3 || length(args) == 4) {
tree_file = args[1]
snv_file = args[2]
o_pdf_file = base::paste(args[3], "plot.pdf", sep='.')
o_tree_file = base::paste(args[3], "tree.tsv", sep='.')
o_exposure_file = base::paste(args[3], "exposure.tsv", sep='.')
if (length(args) == 4) {
S = readLines(args[4])
} else {
S <- c(
"Signature.1",
"Signature.2",
"Signature.3",
"Signature.4",
"Signature.5",
"Signature.6",
"Signature.7",
"Signature.8",
"Signature.9",
"Signature.10",
"Signature.11",
"Signature.12",
"Signature.13",
"Signature.14",
"Signature.15",
"Signature.16",
"Signature.17",
"Signature.18",
"Signature.19",
"Signature.20",
"Signature.21",
"Signature.22",
"Signature.23",
"Signature.24",
"Signature.25",
"Signature.26",
"Signature.27",
"Signature.28",
"Signature.29",
"Signature.30"
)
}
library(optparse, warn.conflicts = FALSE)

option_list <- list(
make_option(c("--tree"), action = "store", type = "character",
default = NULL, help = "tree CSV file"),
make_option(c("--snv"), action = "store", type = "character",
default = NULL, help = "SNV CSV file"),
make_option(c("--outprefix"), action = "store", type = "character",
default = NULL, help = "output file name prefix"),
make_option(c("--signature"), action = "store", type = "character",
default = NULL, help = "selected signatures file, each line contains a signature name"),
make_option(c("--hg38"), action = "store_true",
default = FALSE,
help = "set flag to use reference hg38, [default FALSE]")
)

opt <- parse_args(OptionParser(option_list = option_list))

tree_file = opt$tree
snv_file = opt$snv
o_pdf_file = base::paste(opt$outprefix, "plot.pdf", sep='.')
o_tree_file = base::paste(opt$outprefix, "tree.tsv", sep='.')
o_exposure_file = base::paste(opt$outprefix, "exposure.tsv", sep='.')
if (!is.null(opt$signature)) {
S = readLines(opt$signature)
} else {
cat("Usage: run_physigs.R TREE SNV OUTPUT_PREFIX [SIGNATURES]\nOutput: OUTPUT_PREFIX.plot.pdf, \n")
q(status=1)
S <- c(
"Signature.1",
"Signature.2",
"Signature.3",
"Signature.4",
"Signature.5",
"Signature.6",
"Signature.7",
"Signature.8",
"Signature.9",
"Signature.10",
"Signature.11",
"Signature.12",
"Signature.13",
"Signature.14",
"Signature.15",
"Signature.16",
"Signature.17",
"Signature.18",
"Signature.19",
"Signature.20",
"Signature.21",
"Signature.22",
"Signature.23",
"Signature.24",
"Signature.25",
"Signature.26",
"Signature.27",
"Signature.28",
"Signature.29",
"Signature.30"
)
}

# Input CSV file
Expand All @@ -74,13 +85,17 @@ for (i in 1:nrow(tree_matrix)){
# snv_file <- system.file("extdata", "snv.csv", package = "PhySigs", mustWork = TRUE)
input_mat <- as.data.frame(read.csv(file=snv_file,
colClasses = c("character", "character", "numeric", "character", "character")))
# Prefix contig names that lack it (e.g. "1", "X") with "chr".
# startsWith() returns one logical per row, so the rows to fix are selected
# with a mask; a scalar `if ()` cannot take a condition of length > 1.
# NA entries are left untouched (NA is not allowed in a subscripted
# assignment).
needs_chr_prefix <- !is.na(input_mat$chr) & !startsWith(input_mat$chr, "chr")
input_mat$chr[needs_chr_prefix] <- paste0(
  "chr", input_mat$chr[needs_chr_prefix]
)

# Use deconstructSigs to convert SNVs to 96 Features.
# The --hg38 command-line flag selects the hg38 reference genome; when the
# flag is not set, bsg stays NULL so deconstructSigs uses its own default
# reference (NOTE(review): hg19 per the deconstructSigs docs -- confirm).
reference_genome <- NULL
if (isTRUE(opt$hg38)) {
  reference_genome <-
    BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
}
P <- deconstructSigs::mut.to.sigs.input(mut.ref = input_mat,
                                        sample.id = "Sample",
                                        chr = "chr",
                                        pos = "pos",
                                        ref = "ref",
                                        alt = "alt",
                                        bsg = reference_genome)

# Normalize feature matrix
Expand Down
2 changes: 2 additions & 0 deletions physigs/docker/Dockerfile-physigs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ RUN R -e "BiocManager::install(c('graph'))"
RUN R -e "BiocManager::install(c('BSgenome'))"
RUN R -e "BiocManager::install(c('GenomeInfoDb'))"
RUN R -e "BiocManager::install(c('BSgenome.Hsapiens.UCSC.hg19'))"
RUN R -e "BiocManager::install(c('BSgenome.Hsapiens.UCSC.hg38'))"
RUN R -e "BiocManager::install(c('Rgraphviz'))"
RUN R -e "devtools::install_github('elkebir-group/PhySigs_R')"
RUN R -e "install.packages('optparse')"

COPY ./code /code

Expand Down
4 changes: 2 additions & 2 deletions physigs/physigs-task.wdl
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
task physigs{
input {
File tree_csv
File snv_csv
File snvs_csv
File? signatures
}

command {
pwd
out_dir=$(pwd)
cd /code
sh physigs_entrypoint.sh ${tree_csv} ${snv_csv} $out_dir/physigs_tree \
sh physigs_entrypoint.sh ${tree_csv} ${snvs_csv} $out_dir/physigs_tree \
${if defined(signatures) then signatures else "" }
pwd
ls -al
Expand Down
2 changes: 1 addition & 1 deletion run_physigs_example1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ PHYSIGS_SIGNATURES=./example_data/physigs/signatures.txt

miniwdl run --dir=runs/ physigs/physigs-task.wdl \
tree_csv=$PHYSIGS_TREE \
snv_csv=$PHYSIGS_SNV \
snvs_csv=$PHYSIGS_SNV \
signatures=$PHYSIGS_SIGNATURES

9 changes: 9 additions & 0 deletions run_workflow_vcf_to_trees.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

# Example driver for the vcf_to_trees workflow.
# eg.
# > bash run_workflow_vcf_to_trees.sh
#
# Runs workflows/vcf_to_trees.wdl with miniwdl on the example moss VCF below.
# miniwdl places the run output under runs/ (the directory given to --dir).

FILE=./example_data/mek_lab_vcfs/moss.mutect2.filtered.vcf
# Quote the path so it is passed as one word even if it contains whitespace.
miniwdl run --dir=runs/ workflows/vcf_to_trees.wdl \
    vcf_type=moss \
    vcf_input_file="$FILE" \
    alpha=1
3 changes: 3 additions & 0 deletions spruce/code/spruce_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,8 @@ $build_dir/enumerate -clique $CLIQUE -t 2 -v 3 $DATA > spruce.res
gzip -c spruce.res > spruce.res.gz
echo "SPRUCE: Running rank"
zcat spruce.res.gz | $build_dir/rank - > spruce.merged.res
echo "SPRUCE: Visualize"
zcat spruce.res.gz | $build_dir/visualize -i 0 -a - > spruce.res.txt
zcat spruce.res.gz | $build_dir/visualize -i 0 -j - > spruce.res.json

echo "spruce_entrypoint.sh finished."
2 changes: 2 additions & 0 deletions spruce/spruce-task.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ task spruce_phylogeny {
File cliques = "spruce.cliques"
File result = "spruce.res.gz"
File rank_result = "spruce.merged.res"
File tree_text = "spruce.res.txt"
File tree_json = "spruce.res.json"
}

runtime {
Expand Down
Empty file.
66 changes: 66 additions & 0 deletions spruce_to_physigs/code/py_code/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import sys
import argparse
import csv
import json


def main(args):
    """Run both conversions for one SPRUCE result.

    Args:
        args: parsed command-line namespace; its ``tsv``, ``json`` and
            ``output`` attributes are forwarded to the converters.

    Returns:
        True once both converters have returned. A failure in either one
        propagates as an exception, so False is never returned.
    """
    convert_tsv(args.tsv, args.output)
    convert_tree(args.json, args.output)
    return True


def convert_tsv(tsv_file, output_prefix):
    """Convert a clustered-variant TSV into an SNV CSV.

    Every input row must provide ``cluster_id`` and ``mutation_id`` columns.
    ``mutation_id`` is split as ``chrom:pos:ref>alt``; the first and last
    character of the alt part are dropped (presumably the brackets of a
    stringified allele list such as ``[T]`` -- confirm against the producer).
    One ``Sample,chr,pos,ref,alt`` row is written per input row, with the
    cluster id used as the sample name. The output path is printed.

    Args:
        tsv_file: path of the tab-separated input file.
        output_prefix: path prefix; output goes to ``{output_prefix}.snv.csv``.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if a ``mutation_id`` does not have the expected form.
    """
    output_file = f"{output_prefix}.snv.csv"
    # newline='' leaves line-ending and quoting decisions to the csv module.
    with open(tsv_file, 'r', newline='') as ifile, \
            open(output_file, 'w', newline='') as ofile:
        print(output_file)
        # csv.writer quotes any field containing a comma (e.g. an alt of
        # "G, T"), so every output row keeps exactly five columns.
        writer = csv.writer(ofile, lineterminator='\n')
        writer.writerow(["Sample", "chr", "pos", "ref", "alt"])
        for row in csv.DictReader(ifile, delimiter='\t'):
            sample = row["cluster_id"]
            mutation_id = row["mutation_id"]
            try:
                chrom, pos, ref_alt = mutation_id.split(':')
                ref, alt = ref_alt.split('>')
            except ValueError:
                # Replace the bare unpacking error with one naming the value.
                raise ValueError(
                    f"malformed mutation_id {mutation_id!r}: "
                    "expected 'chrom:pos:ref>[alt]'") from None
            alt = alt[1:-1]
            writer.writerow([sample, chrom, pos, ref, alt])


def convert_tree(json_file, output_prefix, solution_key="sol_0"):
    """Convert a tree JSON file into a two-column edge-list CSV.

    The JSON must contain a ``nodes`` list (objects with ``id`` and
    ``label``) and, under ``solution_key``, a list of edges (objects with
    ``source`` and ``target`` node ids). Each node is named by its label with
    the first and last character removed, cut at the first comma. Edges
    touching a node named ``*`` are skipped. The output path is printed.

    Args:
        json_file: path of the input JSON file.
        output_prefix: path prefix; output goes to
            ``{output_prefix}.tree.csv``.
        solution_key: top-level JSON key holding the edge list to export.
            Defaults to ``"sol_0"``, the value that used to be hard-coded.

    Raises:
        KeyError: if ``nodes``, ``solution_key`` or a referenced node id is
            missing from the JSON.
    """
    # Parse and validate first so a bad input does not leave an empty CSV.
    with open(json_file, 'r') as ifile:
        tree = json.load(ifile)

    id2label = {
        node["id"]: node["label"][1:-1].split(',')[0]
        for node in tree["nodes"]
    }

    edge_lines = []
    for edge in tree[solution_key]:
        source = id2label[edge["source"]]
        target = id2label[edge["target"]]
        # NOTE(review): '*' looks like a placeholder/root label -- confirm.
        if source != '*' and target != '*':
            edge_lines.append(f"{source},{target}")

    output_file = f"{output_prefix}.tree.csv"
    with open(output_file, 'w') as ofile:
        print(output_file)
        ofile.write("V1,V2\n")
        # No trailing newline after the last edge, same as before.
        ofile.write('\n'.join(edge_lines))


if __name__ == "__main__":
    # Pass the sentence as ``description``: the first positional parameter of
    # ArgumentParser is ``prog``, which would put it in the usage line instead.
    parser = argparse.ArgumentParser(
        description="Transform SPRUCE output to Physigs input")
    # main() uses all three values, so a missing one is rejected by argparse
    # up front rather than failing later inside open().
    parser.add_argument("-t", "--tsv", type=str, required=True,
                        help="pyclone-vi's output assignment TSV file, contaning variants")
    parser.add_argument("-j", "--json", type=str, required=True,
                        help="SPRUCE's output tree json file")
    parser.add_argument("-o", "--output", type=str, required=True,
                        help="output file prefix for Physigs")
    # With no arguments at all, print the help text.
    args = parser.parse_args(None if sys.argv[1:] else ['-h'])

    succeeded = main(args)

    # Exit status 0 when main() reports success, 1 otherwise.
    sys.exit(0 if succeeded else 1)
16 changes: 16 additions & 0 deletions spruce_to_physigs/code/spruce_to_physigs_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
## This is the script that runs *inside* the container and receives the arguments
## from the wdl runtime to pass to the spruce_to_physigs python code

# This script expects 3 args
#   $1 tsv: path to a TSV file of variants; py_code.main reads its
#           mutation_id and cluster_id columns
#   $2 json: path to a JSON file containing tree edges produced by the pipeline 'spruce'
#   $3 prefix: output prefix
# It writes 2 CSV files
#   PREFIX.tree.csv
#   PREFIX.snv.csv
# Exit status: 1 on a wrong argument count, otherwise the status of the
# python converter, so a failed conversion is not reported as success.


echo "spruce_to_physigs.sh: begin."
# $* rather than $@: the arguments are embedded in one message string.
echo "spruce_to_physigs.sh: list of input args from wdl runtime: $*"
if [ $# -ne 3 ]; then
    echo "spruce_to_physigs.sh: wrong number of input args, exactly 3 required." >&2
    exit 1
fi
# Quote the paths so whitespace or glob characters cannot split them.
python -B -m py_code.main -t "$1" -j "$2" -o "$3" || exit $?
echo "spruce_to_physigs.sh: finished."
35 changes: 35 additions & 0 deletions spruce_to_physigs/docker/Dockerfile-spruce-to-physigs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
FROM python:3.7

# Home directory for the container user, made world-writable.
# NOTE(review): presumably so the image works under an arbitrary uid -- confirm.
ENV HOME=/home/dockeruser
RUN mkdir /home/dockeruser && \
    touch /home/dockeruser/.bashrc && \
    chmod -R 777 /home/dockeruser

ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
# key=value form; the space-separated ENV form is legacy syntax.
# NOTE(review): nothing in this Dockerfile installs files under /opt/conda --
# confirm this PATH entry is still needed.
ENV PATH=/opt/conda/bin:$PATH

# Install system packages, then drop the apt package lists in the same layer
# so they do not end up in the image.
RUN apt-get -qq update && \
    apt-get -qq install -y --no-install-recommends \
        iproute2 \
        sudo \
        lxc \
        libltdl7 \
        ssh \
        vim \
        gnupg2 \
        curl \
        wget \
        bzip2 \
        ca-certificates \
        python3 \
        python3-dev \
        python3-pip && \
    rm -rf /var/lib/apt/lists/*

# copy the phyloflow/spruce_to_physigs/code directory into the container.
# NOTE this says the local code is in '.' b/c the build_spruce_to_physigs_container.sh script
# uses '..' as the context for the build process. Necessary to avoid this issue:
# https://stackoverflow.com/questions/27068596/how-to-include-files-outside-of-dockers-build-context
COPY ./code /code

# Build-time listings, useful when checking that the copy landed as expected.
RUN ls -al /
RUN ls -al /code/
3 changes: 3 additions & 0 deletions spruce_to_physigs/docker/build_spruce_to_physigs_container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Build the phyloflow/spruce-to-physigs image.
# Run this from the directory that holds Dockerfile-spruce-to-physigs:
# --file is resolved against the current directory, and '..' (its parent)
# is used as the build context.
echo "Begin build_spruce_to_physigs_container.sh"
# Stop with docker's exit status on failure so "Finished" is only printed
# for a build that actually succeeded.
docker build --file=Dockerfile-spruce-to-physigs --tag=phyloflow/spruce-to-physigs:latest .. || exit $?
echo "Finished build_spruce_to_physigs_container.sh"
32 changes: 32 additions & 0 deletions spruce_to_physigs/spruce-to-physigs-task.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Task that runs spruce_to_physigs_entrypoint.sh inside the
# phyloflow/spruce-to-physigs container and collects the two CSV files it
# writes under the "physigs" prefix in the task working directory.
task spruce_to_physigs_transform{
input {
# TSV of variants; passed as the 1st argument of the entrypoint script.
File tsv_variants
# JSON tree file; passed as the 2nd argument of the entrypoint script.
File json_edge_list
}

# The entrypoint is started from /code with the output prefix
# $out_dir/physigs, where out_dir is the directory the command starts in.
# NOTE(review): pyclone_samples is created but not referenced by any later
# command in this task -- looks like a leftover; confirm before removing.
# The pwd / ls calls only add diagnostic listings to stdout.
command {
pwd
mkdir pyclone_samples
out_dir=$(pwd)
cd /code
sh spruce_to_physigs_entrypoint.sh ${tsv_variants} ${json_edge_list} $out_dir/physigs
pwd
ls -al
cd ..
ls -al /
ls /mnt
}

output {
# Captured standard output / standard error of the command above.
File response = stdout()
File err_response = stderr()

# Files expected in the task working directory after the command has run;
# the names are the "physigs" prefix plus ".tree.csv" / ".snv.csv".
File tree_csv = "physigs.tree.csv"
File snvs_csv = "physigs.snv.csv"
}

runtime {
# Image tag produced by build_spruce_to_physigs_container.sh.
docker: 'phyloflow/spruce-to-physigs:latest'
}
}

3 changes: 2 additions & 1 deletion vcf_transform/code/py_code/mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,9 @@ def _construct_mutation_id(vcf_record:vcf.model._Record) -> str:
chrom = str(vcf_record.CHROM)
pos = str(vcf_record.POS)
ref = str(vcf_record.REF)
alt = str(vcf_record.ALT)
#mid = (chrom + ":" + pos + ":" + ref).decode('utf-8')
mid = (chrom + ":" + pos)
mid = (chrom + ":" + pos + ":" + ref + ">" + alt)
return mid


Expand Down
2 changes: 2 additions & 0 deletions workflows/clusters_to_trees.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ workflow clusters_to_trees{
File cliques = step2.cliques
File result = step2.result
File rank_result = step2.rank_result
File tree_text = step2.tree_text
File tree_json = step2.tree_json
}

}
Expand Down
Loading