@@ -13,7 +13,7 @@ def make_predictions(chromosome, enhancers, genes, args):
13
13
hic_file , hic_norm_file , hic_is_vc = get_hic_file (chromosome , args .HiCdir )
14
14
pred = add_hic_to_enh_gene_table (enhancers , genes , pred , hic_file , hic_norm_file , hic_is_vc , chromosome , args )
15
15
16
- pred = compute_score (pred , [pred ['activity_base' ], pred ['hic_kr_pl_scaled_adj ' ]], "ABC" )
16
+ pred = compute_score (pred , [pred ['activity_base' ], pred ['hic_contact_pl_scaled_adj ' ]], "ABC" )
17
17
pred = compute_score (pred , [pred ['activity_base' ], pred ['powerlaw_contact_reference' ]], "powerlaw" )
18
18
19
19
return pred
@@ -81,22 +81,22 @@ def add_hic_to_enh_gene_table(enh, genes, pred, hic_file, hic_norm_file, hic_is_
81
81
#Overlap in one direction
82
82
enh_hic1 = df_to_pyranges (enh , start_col = 'enh_midpoint' , end_col = 'enh_midpoint' , end_slop = 1 ).join (hic1 ).df
83
83
genes_hic2 = df_to_pyranges (genes , start_col = 'TargetGeneTSS' , end_col = 'TargetGeneTSS' , end_slop = 1 ).join (hic2 ).df
84
- ovl12 = enh_hic1 [['enh_idx' ,'hic_idx' ,'hic_kr ' ]].merge (genes_hic2 [['gene_idx' , 'hic_idx' ]], on = 'hic_idx' )
84
+ ovl12 = enh_hic1 [['enh_idx' ,'hic_idx' ,'hic_contact ' ]].merge (genes_hic2 [['gene_idx' , 'hic_idx' ]], on = 'hic_idx' )
85
85
86
86
#Overlap in the other direction
87
87
enh_hic2 = df_to_pyranges (enh , start_col = 'enh_midpoint' , end_col = 'enh_midpoint' , end_slop = 1 ).join (hic2 ).df
88
88
genes_hic1 = df_to_pyranges (genes , start_col = 'TargetGeneTSS' , end_col = 'TargetGeneTSS' , end_slop = 1 ).join (hic1 ).df
89
- ovl21 = enh_hic2 [['enh_idx' ,'hic_idx' ,'hic_kr ' ]].merge (genes_hic1 [['gene_idx' , 'hic_idx' ]], on = ['hic_idx' ])
89
+ ovl21 = enh_hic2 [['enh_idx' ,'hic_idx' ,'hic_contact ' ]].merge (genes_hic1 [['gene_idx' , 'hic_idx' ]], on = ['hic_idx' ])
90
90
91
91
#Concatenate both directions and merge into predictions
92
92
ovl = pd .concat ([ovl12 , ovl21 ]).drop_duplicates ()
93
93
pred = pred .merge (ovl , on = ['enh_idx' , 'gene_idx' ], how = 'left' )
94
- pred .fillna (value = {'hic_kr ' : 0 }, inplace = True )
94
+ pred .fillna (value = {'hic_contact ' : 0 }, inplace = True )
95
95
elif args .hic_type == "juicebox" :
96
96
#Merge directly using indices
97
97
#Could also do this by indexing into the sparse matrix (instead of merge) but this seems to be slower
98
98
#Index into sparse matrix
99
- #pred['hic_kr '] = [HiC[i,j] for (i,j) in pred[['enh_bin','tss_bin']].values.tolist()]
99
+ #pred['hic_contact '] = [HiC[i,j] for (i,j) in pred[['enh_bin','tss_bin']].values.tolist()]
100
100
101
101
pred ['enh_bin' ] = np .floor (pred ['enh_midpoint' ] / args .hic_resolution ).astype (int )
102
102
pred ['tss_bin' ] = np .floor (pred ['TargetGeneTSS' ] / args .hic_resolution ).astype (int )
@@ -106,13 +106,13 @@ def add_hic_to_enh_gene_table(enh, genes, pred, hic_file, hic_norm_file, hic_is_
106
106
pred ['bin1' ] = np .amin (pred [['enh_bin' , 'tss_bin' ]], axis = 1 )
107
107
pred ['bin2' ] = np .amax (pred [['enh_bin' , 'tss_bin' ]], axis = 1 )
108
108
pred = pred .merge (HiC , how = 'left' , on = ['bin1' ,'bin2' ])
109
- pred .fillna (value = {'hic_kr ' : 0 }, inplace = True )
109
+ pred .fillna (value = {'hic_contact ' : 0 }, inplace = True )
110
110
else :
111
111
# The matrix is not triangular, it's full
112
112
# For VC assume genes correspond to rows and enhancers to columns
113
113
pred = pred .merge (HiC , how = 'left' , left_on = ['tss_bin' ,'enh_bin' ], right_on = ['bin1' ,'bin2' ])
114
114
115
- pred .fillna (value = {'hic_kr ' : 0 }, inplace = True )
115
+ pred .fillna (value = {'hic_contact ' : 0 }, inplace = True )
116
116
117
117
# QC juicebox HiC
118
118
pred = qc_hic (pred )
@@ -136,13 +136,13 @@ def scale_with_powerlaw(pred, args):
136
136
#Scale hic values to
137
137
138
138
if not args .scale_hic_using_powerlaw :
139
- pred ['hic_kr_pl_scaled ' ] = pred ['hic_kr ' ]
139
+ pred ['hic_contact_pl_scaled ' ] = pred ['hic_contact ' ]
140
140
else :
141
141
powerlaw_estimate = get_powerlaw_at_distance (pred ['distance' ].values , args .hic_gamma )
142
142
powerlaw_estimate_reference = get_powerlaw_at_distance (pred ['distance' ].values , args .hic_gamma_reference )
143
143
pred ['powerlaw_contact' ] = powerlaw_estimate
144
144
pred ['powerlaw_contact_reference' ] = powerlaw_estimate_reference
145
- pred ['hic_kr_pl_scaled ' ] = pred ['hic_kr ' ] * (powerlaw_estimate_reference / powerlaw_estimate )
145
+ pred ['hic_contact_pl_scaled ' ] = pred ['hic_contact ' ] * (powerlaw_estimate_reference / powerlaw_estimate )
146
146
147
147
return (pred )
148
148
@@ -154,22 +154,20 @@ def add_hic_pseudocount(pred, args):
154
154
155
155
pseudocount = np .amin (pd .DataFrame ({'a' : powerlaw_fit , 'b' : powerlaw_fit_at_ref }), axis = 1 )
156
156
pred ['hic_pseudocount' ] = pseudocount
157
- pred ['hic_kr_pl_scaled_adj ' ] = pred ['hic_kr_pl_scaled ' ] + pseudocount
157
+ pred ['hic_contact_pl_scaled_adj ' ] = pred ['hic_contact_pl_scaled ' ] + pseudocount
158
158
159
159
return (pred )
160
160
161
161
def qc_hic(pred, threshold=.01):
    """Blank out Hi-C contact for genes with insufficient Hi-C coverage.

    For every ``TargetGene``, the ``hic_contact`` values of the rows flagged
    ``isSelfPromoter`` are summed. Genes whose sum is below ``threshold``
    have ``hic_contact`` set to NaN on all of their rows.

    Genes that have no ``isSelfPromoter`` row never appear in the summary
    and are therefore left untouched.

    Args:
        pred: DataFrame with ``TargetGene``, ``isSelfPromoter`` and
            ``hic_contact`` columns. It is modified in place.
        threshold: Minimum summed self-promoter contact a gene needs to keep
            its ``hic_contact`` values.

    Returns:
        The same ``pred`` object, after the in-place update.
    """
    # Genes with insufficient hic coverage should get nan'd
    self_promoter_rows = pred.loc[pred['isSelfPromoter'], :]
    summ = self_promoter_rows.groupby(['TargetGene']).agg({'hic_contact': 'sum'})
    bad_genes = summ.loc[summ['hic_contact'] < threshold, :].index

    # NaN every row of a failing gene, not just its self-promoter rows.
    pred.loc[pred['TargetGene'].isin(bad_genes), 'hic_contact'] = np.nan

    return pred
170
170
171
-
172
-
173
171
def compute_score (enhancers , product_terms , prefix ):
174
172
175
173
scores = np .column_stack (product_terms ).prod (axis = 1 )
0 commit comments