-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeneactivityscores.wdl
137 lines (117 loc) · 6.09 KB
/
geneactivityscores.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
version 1.0
workflow get_scores {
call run_get_gene_activity
output {
File get_scores_output = run_get_gene_activity.scores_object
}
}
task run_get_gene_activity {
input {
String output_dir # gbucket (no / at end)
File DARs_file # DARs.pkl
File cistopic_file # cistopic_obj_plus_model.pkl
File imputed_acc_file # Imputed_accessibility.pkl
Int cpu = 24
Int memory = 256
String docker = "dyeramosu/scenic_plus_terra:1.0.0"
Int preemptible = 0
Int disk_space = 128
}
command <<<
set -e
mkdir tmpdir
mkdir get_scores_output_wdl
python << CODE
# imports
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import pickle
from pycisTopic.cistopic_class import *
# LOAD OBJECTS
cistopic_obj = pickle.load(open('~{cistopic_file}', 'rb'))
imputed_acc_obj = pickle.load(open('~{imputed_acc_file}', 'rb'))
DARs_dict = pickle.load(open('~{DARs_file}', 'rb'))
# RETRIEVE GENE ANNOTATION AND CHROMOSOME SIZES FOR OUR GENOME
## get TSS annotations
import pybiomart as pbm
import pyranges as pr
dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://sep2019.archive.ensembl.org/')
annot = dataset.query(attributes=['chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_name',
'transcription_start_site', 'transcript_biotype'])
annot['Chromosome/scaffold name'] = 'chr' + annot['Chromosome/scaffold name'].astype(str)
annot.columns=['Chromosome', 'Start', 'End', 'Strand', 'Gene','Transcription_Start_Site', 'Transcript_type']
annot = annot[annot.Transcript_type == 'protein_coding']
annot.Strand[annot.Strand == 1] = '+'
annot.Strand[annot.Strand == -1] = '-'
pr_annotation = pr.PyRanges(annot.dropna(axis = 0))
## get chromosome sizes
import pandas as pd
import requests
target_url='http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes'
chromsizes=pd.read_csv(target_url, sep='\t', header=None)
chromsizes.columns=['Chromosome', 'End']
chromsizes['Start']=[0]*chromsizes.shape[0]
chromsizes=chromsizes.loc[:,['Chromosome', 'Start', 'End']]
chromsizes=pr.PyRanges(chromsizes)
# INFER GENE ACTIVITY
from pycisTopic.gene_activity import *
gene_act, weights = get_gene_activity(imputed_acc_obj, # Region-cell probabilities
pr_annotation, # Gene annotation
chromsizes, # Chromosome size
use_gene_boundaries=True, # Whether to use the whole search space or stop when encountering another gene
upstream=[1000, 100000], # Search space upstream. The minimum means that even if there is a gene right next to it
#these bp will be taken (1kbp here)
downstream=[1000,100000], # Search space downstream
distance_weight=True, # Whether to add a distance weight (an exponential function, the weight will decrease with distance)
decay_rate=1, # Exponent for the distance exponential funciton (the higher the faster will be the decrease)
extend_gene_body_upstream=10000, # Number of bp upstream immune to the distance weight (their value will be maximum for
#this weight)
extend_gene_body_downstream=500, # Number of bp downstream immune to the distance weight
gene_size_weight=False, # Whether to add a weights based on the length of the gene
gene_size_scale_factor='median', # Dividend to calculate the gene size weight. Default is the median value of all genes
#in the genome
remove_promoters=False, # Whether to remove promoters when computing gene activity scores
average_scores=True, # Whether to divide by the total number of region assigned to a gene when calculating the gene
#activity score
scale_factor=1, # Value to multiply for the final gene activity matrix
extend_tss=[10,10], # Space to consider a promoter
gini_weight = True, # Whether to add a gini index weight. The more unique the region is, the higher this weight will be
return_weights= True, # Whether to return the final weights
project='Gene_activity') # Project name for the gene activity object
# INFER DAGs
markers_dict = find_diff_features(cistopic_obj,
gene_act,
variable='states',
var_features=None,
contrasts=None,
adjpval_thr=0.05,
log2fc_thr=np.log2(1.5),
n_cpu=5,
_temp_dir=os.path.abspath("tmpdir/"),
split_pattern = '-')
# SAVE
with open(os.path.join('get_scores_output_wdl', 'Gene_activity.pkl'), 'wb') as f:
pickle.dump(gene_act, f)
with open(os.path.join('get_scores_output_wdl', 'weights.pkl'), 'wb') as f:
pickle.dump(weights, f)
with open(os.path.join('get_scores_output_wdl', 'DAGs.pkl'), 'wb') as f:
pickle.dump(markers_dict, f)
CODE
tar -czvf get_scores_output.tar.gz get_scores_output_wdl
gsutil rsync -r get_scores_output_wdl ~{output_dir}
>>>
output {
File scores_object = 'get_scores_output.tar.gz'
}
runtime {
docker: docker
memory: memory + "G"
bootDiskSizeGb: 12
disks: "local-disk " + disk_space + " HDD"
cpu: cpu
preemptible: preemptible
}
}