
Commit d1d1f19

hic updates

1 parent 65b95f5 commit d1d1f19

File tree

4 files changed

+44 -29 lines changed

src/compute_powerlaw_fit_from_hic.py

+6-4
@@ -50,6 +50,7 @@ def main():
 
 def load_hic_juicebox(args):
     chromosomes = ['chr' + str(x) for x in list(range(1,23))] + ['chrX']
+    #chromosomes = ['chr22']
     #file_list = glob.glob(os.path.join(args.hicDir,'chr*/chr*.KRobserved'))
 
     all_data_list = []
@@ -85,11 +86,12 @@ def do_powerlaw_fit(HiC):
 
     #TO DO:
     #Print out mean/var plot of powerlaw relationship
-    HiC_summary = HiC.groupby('dist_for_fit').agg({'hic_kr' : 'sum'})
-    HiC_summary['hic_kr'] = HiC_summary.hic_kr / HiC_summary.hic_kr.sum() #technically this normalization should be over the entire genome (not just to maxWindow). Will only affect intercept though..
-    res = stats.linregress(np.log(HiC_summary.index), np.log(HiC_summary['hic_kr']))
+    HiC_summary = HiC.groupby('dist_for_fit').agg({'hic_contact' : 'sum'})
+    HiC_summary['hic_contact'] = HiC_summary.hic_contact / HiC_summary.hic_contact.sum() #technically this normalization should be over the entire genome (not just to maxWindow). Will only affect intercept though..
+    res = stats.linregress(np.log(HiC_summary.index), np.log(HiC_summary['hic_contact']))
 
-    hic_mean_var = HiC.groupby('dist_for_fit').agg({'hic_kr' : ['mean','var']})
+    hic_mean_var = HiC.groupby('dist_for_fit').agg({'hic_contact' : ['mean','var']})
+    hic_mean_var.columns = ['mean', 'var']
 
     return res.slope, res.intercept, hic_mean_var

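For reference, do_powerlaw_fit regresses log contact against log genomic distance, so the renamed hic_contact column feeds directly into the power-law parameters used downstream. Below is a minimal, self-contained sketch of that fit on toy data; the toy values and the expected_contact helper are illustrative only (in the repo, get_powerlaw_at_distance plays that role), while the groupby/normalize/linregress steps mirror the changed lines above.

import numpy as np
import pandas as pd
from scipy import stats

# Toy contact table: one row per bin pair, with the distance used for fitting
# and the renamed hic_contact value. All numbers are made up.
HiC = pd.DataFrame({
    'dist_for_fit': [5000, 5000, 10000, 10000, 20000, 20000, 40000, 40000],
    'hic_contact':  [0.90, 1.10, 0.45, 0.55, 0.24, 0.26, 0.12, 0.13],
})

# Sum contacts at each distance and normalize, as in do_powerlaw_fit.
HiC_summary = HiC.groupby('dist_for_fit').agg({'hic_contact': 'sum'})
HiC_summary['hic_contact'] = HiC_summary.hic_contact / HiC_summary.hic_contact.sum()

# Fit a line in log-log space; the slope is negative for decaying contact.
res = stats.linregress(np.log(HiC_summary.index), np.log(HiC_summary['hic_contact']))

# Expected contact at an arbitrary distance under the fitted power law
# (illustrative stand-in for get_powerlaw_at_distance).
def expected_contact(distance, slope, intercept):
    return np.exp(intercept) * distance ** slope

print(res.slope, res.intercept, expected_contact(15000, res.slope, res.intercept))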
src/hic.py

+22-9
@@ -8,11 +8,11 @@ def get_hic_file(chromosome, hic_dir, allow_vc=True):
     hic_norm = os.path.join(hic_dir, chromosome, chromosome + ".KRnorm.gz")
 
     is_vc = False
-    if allow_vc and not (os.path.exists(hic_file) and os.path.getsize(hic_file) > 0):
+    if allow_vc and not hic_exists(hic_file):
         hic_file = os.path.join(hic_dir, chromosome, chromosome + ".VCobserved.gz")
         hic_norm = os.path.join(hic_dir, chromosome, chromosome + ".VCnorm.gz")
 
-        if not (os.path.exists(hic_file) and os.path.getsize(hic_file) > 0):
+        if not hic_exists(hic_file):
             RuntimeError("Could not find KR or VC normalized hic files")
         else:
             print("Could not find KR normalized hic file. Using VC normalized hic file")
@@ -21,6 +21,15 @@ def get_hic_file(chromosome, hic_dir, allow_vc=True):
     print("Using: " + hic_file)
     return hic_file, hic_norm, is_vc
 
+def hic_exists(file):
+    if not os.path.exists(file):
+        return False
+    elif file.endswith('gz'):
+        #gzip file still have some size. This is a hack
+        return (os.path.getsize(file) > 100)
+    else:
+        return (os.path.getsize(file) > 0)
+
 def load_hic(hic_file, hic_norm_file, hic_is_vc, hic_type, hic_resolution, tss_hic_contribution, window, min_window, gamma, interpolate_nan=True, apply_diagonal_bin_correction=True):
     print("Loading HiC")
 
@@ -65,8 +74,12 @@ def process_hic(hic_mat, hic_norm_file, hic_is_vc, resolution, tss_hic_contribut
         sums = sums[~np.isnan(sums)]
         assert(np.max(sums[sums > 0])/np.min(sums[sums > 0]) < 1.001)
         mean_sum = np.mean(sums[sums > 0])
-        print('HiC Matrix has row sums of {}, making doubly stochastic...'.format(mean_sum))
-        hic_mat = hic_mat.multiply(1/mean_sum)
+
+        if abs(mean_sum - 1) < .001:
+            print('HiC Matrix has row sums of {}, continuing without making doubly stochastic'.format(mean_sum))
+        else:
+            print('HiC Matrix has row sums of {}, making doubly stochastic...'.format(mean_sum))
+            hic_mat = hic_mat.multiply(1/mean_sum)
 
         #Slow version. Its a constant scalar so don't need to to the matrix multiplication
         # kr_vec = np.repeat(np.sqrt(mean_sum), sums.shape[1])
@@ -105,7 +118,7 @@ def process_hic(hic_mat, hic_norm_file, hic_is_vc, resolution, tss_hic_contribut
 
     #Turn into dataframe
     hic_mat = hic_mat.tocoo(copy=False)
-    hic_df = pd.DataFrame({'bin1': hic_mat.row, 'bin2': hic_mat.col, 'hic_kr': hic_mat.data})
+    hic_df = pd.DataFrame({'bin1': hic_mat.row, 'bin2': hic_mat.col, 'hic_contact': hic_mat.data})
 
     #Prune to window
     hic_df = hic_df.loc[np.logical_and(abs(hic_df['bin1'] - hic_df['bin2']) <= window/resolution, abs(hic_df['bin1'] - hic_df['bin2']) >= min_window/resolution)]
@@ -116,8 +129,8 @@ def process_hic(hic_mat, hic_norm_file, hic_is_vc, resolution, tss_hic_contribut
     #So need to fill these. Use powerlaw.
     #Not ideal obviously but the scipy interpolation algos are either very slow or don't work since the nan structure implies that not all nans are interpolated
     if interpolate_nan:
-        nan_loc = np.isnan(hic_df['hic_kr'])
-        hic_df.loc[nan_loc,'hic_kr'] = get_powerlaw_at_distance(abs(hic_df.loc[nan_loc,'bin1'] - hic_df.loc[nan_loc,'bin2']) * resolution, gamma)
+        nan_loc = np.isnan(hic_df['hic_contact'])
+        hic_df.loc[nan_loc,'hic_contact'] = get_powerlaw_at_distance(abs(hic_df.loc[nan_loc,'bin1'] - hic_df.loc[nan_loc,'bin2']) * resolution, gamma)
 
     print('process.hic: Elapsed time: {}'.format(time.time() - t))
 
@@ -138,7 +151,7 @@ def apply_kr_threshold(hic_mat, hic_norm_file, kr_cutoff):
 
 def hic_to_sparse(filename, norm_file, resolution, hic_is_doubly_stochastic=False):
     t = time.time()
-    HiC = pd.read_table(filename, names=["bin1", "bin2", "hic_kr"],
+    HiC = pd.read_table(filename, names=["bin1", "bin2", "hic_contact"],
                         header=None, engine='c', memory_map=True)
 
     # verify our assumptions
@@ -153,7 +166,7 @@ def hic_to_sparse(filename, norm_file, resolution, hic_is_doubly_stochastic=Fals
     # accumulates repeated indices, so this will do the right thing.
     row = np.floor(HiC.bin1.values / resolution).astype(int)
     col = np.floor(HiC.bin2.values / resolution).astype(int)
-    dat = HiC.hic_kr.values
+    dat = HiC.hic_contact.values
 
     #JN: Need both triangles in order to compute row/column sums to make double stochastic.
     #If juicebox is upgraded to return DS matrices, then can remove one triangle

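The new hic_exists helper treats small gzip files as missing because even an empty .gz file has a nonzero size, and process_hic now skips the rescale when the matrix is already (near) doubly stochastic. Below is a small sketch of that row-sum check on a toy scipy sparse matrix; the matrix values are illustrative, and only the 0.001 threshold and the multiply-by-1/mean_sum step come from the diff.

import numpy as np
import scipy.sparse as ssp

# Toy symmetric contact matrix whose rows already sum to ~1 (illustrative values).
hic_mat = ssp.csr_matrix(np.array([[0.5, 0.3, 0.2],
                                   [0.3, 0.4, 0.3],
                                   [0.2, 0.3, 0.5]]))

# Column sums of the full matrix; for a symmetric matrix these equal the row sums.
sums = np.asarray(hic_mat.sum(axis=0)).flatten()
mean_sum = np.mean(sums[sums > 0])

# New behavior: only rescale when the matrix is not already doubly stochastic.
if abs(mean_sum - 1) < .001:
    print('HiC Matrix has row sums of {}, continuing without making doubly stochastic'.format(mean_sum))
else:
    print('HiC Matrix has row sums of {}, making doubly stochastic...'.format(mean_sum))
    hic_mat = hic_mat.multiply(1 / mean_sum)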
src/makeAverageHiC.py

+3-1
@@ -6,6 +6,8 @@
 from tools import write_params
 import pyranges
 
+# To do
+# Final output matrix needs to be KR normed as well
 def parseargs():
     class formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
         pass
@@ -89,7 +91,7 @@ def main():
     all_hic = all_hic.loc[np.logical_or(all_hic['avg_hic'] > 0, np.isnan(all_hic['avg_hic'])), ] # why do these 0's exist?
 
     os.makedirs(os.path.join(args.outDir, args.chromosome), exist_ok=True)
-    all_hic.to_csv(os.path.join(args.outDir, args.chromosome, args.chromosome + ".KRobserved.gz"), sep="\t", header=False, index=False, compression="gzip", na_rep=np.nan)
+    all_hic.to_csv(os.path.join(args.outDir, args.chromosome, args.chromosome + ".avg.gz"), sep="\t", header=False, index=False, compression="gzip", na_rep=np.nan)
 
 def scale_hic_with_powerlaw(hic, resolution, scale_ref, gamma_ref, scale, gamma):

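The renamed output is still a headerless, tab-separated, gzipped table, so it can be read back the same way hic_to_sparse in src/hic.py reads contact files. A hedged sketch follows, assuming the averaged table keeps a bin1/bin2/value column layout; the path and column names here are illustrative.

import pandas as pd

# Illustrative path; the script writes <outDir>/<chromosome>/<chromosome>.avg.gz
avg_file = "average_hic/chr22/chr22.avg.gz"

# Headerless, tab-separated, gzip-compressed, mirroring the to_csv call above.
avg_hic = pd.read_table(avg_file, names=["bin1", "bin2", "hic_contact"],
                        header=None, compression="gzip")
print(avg_hic.head())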
src/predictor.py

+13-15
@@ -13,7 +13,7 @@ def make_predictions(chromosome, enhancers, genes, args):
         hic_file, hic_norm_file, hic_is_vc = get_hic_file(chromosome, args.HiCdir)
         pred = add_hic_to_enh_gene_table(enhancers, genes, pred, hic_file, hic_norm_file, hic_is_vc, chromosome, args)
 
-        pred = compute_score(pred, [pred['activity_base'], pred['hic_kr_pl_scaled_adj']], "ABC")
+        pred = compute_score(pred, [pred['activity_base'], pred['hic_contact_pl_scaled_adj']], "ABC")
     pred = compute_score(pred, [pred['activity_base'], pred['powerlaw_contact_reference']], "powerlaw")
 
     return pred
@@ -81,22 +81,22 @@ def add_hic_to_enh_gene_table(enh, genes, pred, hic_file, hic_norm_file, hic_is_
         #Overlap in one direction
         enh_hic1 = df_to_pyranges(enh, start_col = 'enh_midpoint', end_col = 'enh_midpoint', end_slop = 1).join(hic1).df
         genes_hic2 = df_to_pyranges(genes, start_col = 'TargetGeneTSS', end_col = 'TargetGeneTSS', end_slop = 1).join(hic2).df
-        ovl12 = enh_hic1[['enh_idx','hic_idx','hic_kr']].merge(genes_hic2[['gene_idx', 'hic_idx']], on = 'hic_idx')
+        ovl12 = enh_hic1[['enh_idx','hic_idx','hic_contact']].merge(genes_hic2[['gene_idx', 'hic_idx']], on = 'hic_idx')
 
         #Overlap in the other direction
         enh_hic2 = df_to_pyranges(enh, start_col = 'enh_midpoint', end_col = 'enh_midpoint', end_slop = 1).join(hic2).df
         genes_hic1 = df_to_pyranges(genes, start_col = 'TargetGeneTSS', end_col = 'TargetGeneTSS', end_slop = 1).join(hic1).df
-        ovl21 = enh_hic2[['enh_idx','hic_idx','hic_kr']].merge(genes_hic1[['gene_idx', 'hic_idx']], on = ['hic_idx'])
+        ovl21 = enh_hic2[['enh_idx','hic_idx','hic_contact']].merge(genes_hic1[['gene_idx', 'hic_idx']], on = ['hic_idx'])
 
         #Concatenate both directions and merge into preditions
         ovl = pd.concat([ovl12, ovl21]).drop_duplicates()
         pred = pred.merge(ovl, on = ['enh_idx', 'gene_idx'], how = 'left')
-        pred.fillna(value={'hic_kr' : 0}, inplace=True)
+        pred.fillna(value={'hic_contact' : 0}, inplace=True)
     elif args.hic_type == "juicebox":
         #Merge directly using indices
         #Could also do this by indexing into the sparse matrix (instead of merge) but this seems to be slower
         #Index into sparse matrix
-        #pred['hic_kr'] = [HiC[i,j] for (i,j) in pred[['enh_bin','tss_bin']].values.tolist()]
+        #pred['hic_contact'] = [HiC[i,j] for (i,j) in pred[['enh_bin','tss_bin']].values.tolist()]
 
         pred['enh_bin'] = np.floor(pred['enh_midpoint'] / args.hic_resolution).astype(int)
         pred['tss_bin'] = np.floor(pred['TargetGeneTSS'] / args.hic_resolution).astype(int)
@@ -106,13 +106,13 @@ def add_hic_to_enh_gene_table(enh, genes, pred, hic_file, hic_norm_file, hic_is_
             pred['bin1'] = np.amin(pred[['enh_bin', 'tss_bin']], axis = 1)
             pred['bin2'] = np.amax(pred[['enh_bin', 'tss_bin']], axis = 1)
             pred = pred.merge(HiC, how = 'left', on = ['bin1','bin2'])
-            pred.fillna(value={'hic_kr' : 0}, inplace=True)
+            pred.fillna(value={'hic_contact' : 0}, inplace=True)
         else:
             # The matrix is not triangular, its full
             # For VC assume genes correspond to rows and columns to enhancers
            pred = pred.merge(HiC, how = 'left', left_on = ['tss_bin','enh_bin'], right_on=['bin1','bin2'])
 
-            pred.fillna(value={'hic_kr' : 0}, inplace=True)
+            pred.fillna(value={'hic_contact' : 0}, inplace=True)
 
         # QC juicebox HiC
         pred = qc_hic(pred)
@@ -136,13 +136,13 @@ def scale_with_powerlaw(pred, args):
     #Scale hic values to
 
     if not args.scale_hic_using_powerlaw:
-        pred['hic_kr_pl_scaled'] = pred['hic_kr']
+        pred['hic_contact_pl_scaled'] = pred['hic_contact']
     else:
         powerlaw_estimate = get_powerlaw_at_distance(pred['distance'].values, args.hic_gamma)
         powerlaw_estimate_reference = get_powerlaw_at_distance(pred['distance'].values, args.hic_gamma_reference)
         pred['powerlaw_contact'] = powerlaw_estimate
         pred['powerlaw_contact_reference'] = powerlaw_estimate_reference
-        pred['hic_kr_pl_scaled'] = pred['hic_kr'] * (powerlaw_estimate_reference / powerlaw_estimate)
+        pred['hic_contact_pl_scaled'] = pred['hic_contact'] * (powerlaw_estimate_reference / powerlaw_estimate)
 
     return(pred)
 
@@ -154,22 +154,20 @@ def add_hic_pseudocount(pred, args):
 
     pseudocount = np.amin(pd.DataFrame({'a' : powerlaw_fit, 'b' : powerlaw_fit_at_ref}), axis = 1)
     pred['hic_pseudocount'] = pseudocount
-    pred['hic_kr_pl_scaled_adj'] = pred['hic_kr_pl_scaled'] + pseudocount
+    pred['hic_contact_pl_scaled_adj'] = pred['hic_contact_pl_scaled'] + pseudocount
 
     return(pred)
 
 def qc_hic(pred, threshold = .01):
     # Genes with insufficient hic coverage should get nan'd
 
-    summ = pred.loc[pred['isSelfPromoter'],:].groupby(['TargetGene']).agg({'hic_kr' : 'sum'})
-    bad_genes = summ.loc[summ['hic_kr'] < threshold,:].index
+    summ = pred.loc[pred['isSelfPromoter'],:].groupby(['TargetGene']).agg({'hic_contact' : 'sum'})
+    bad_genes = summ.loc[summ['hic_contact'] < threshold,:].index
 
-    pred.loc[pred['TargetGene'].isin(bad_genes), 'hic_kr'] = np.nan
+    pred.loc[pred['TargetGene'].isin(bad_genes), 'hic_contact'] = np.nan
 
     return pred
 
-
-
 def compute_score(enhancers, product_terms, prefix):
 
     scores = np.column_stack(product_terms).prod(axis = 1)

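For context, compute_score (visible at the bottom of this diff) takes a row-wise product of its terms, so the ABC score numerator multiplies activity_base by the pseudocount-adjusted, power-law-scaled contact, while the powerlaw score swaps in powerlaw_contact_reference. A toy sketch of that product under the renamed columns; the input values and the output column name are illustrative guesses, not the repo's exact output schema.

import numpy as np
import pandas as pd

# Toy enhancer-gene pairs with the contact columns this commit renames/uses.
pred = pd.DataFrame({
    'activity_base': [4.0, 1.5, 0.8],
    'hic_contact_pl_scaled_adj': [0.02, 0.10, 0.01],
    'powerlaw_contact_reference': [0.03, 0.05, 0.01],
})

def compute_score(enhancers, product_terms, prefix):
    # Row-wise product of the supplied terms, as in predictor.py;
    # the output column name is a guess for illustration.
    scores = np.column_stack(product_terms).prod(axis=1)
    enhancers[prefix + '.Score.Numerator'] = scores
    return enhancers

pred = compute_score(pred, [pred['activity_base'], pred['hic_contact_pl_scaled_adj']], "ABC")
pred = compute_score(pred, [pred['activity_base'], pred['powerlaw_contact_reference']], "powerlaw")
print(pred)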