
Commit d1d1f19

hic updates

1 parent 65b95f5 commit d1d1f19

File tree

4 files changed

+44 -29 lines changed

src/compute_powerlaw_fit_from_hic.py

+6-4
@@ -50,6 +50,7 @@ def main():
 
 def load_hic_juicebox(args):
     chromosomes = ['chr' + str(x) for x in list(range(1,23))] + ['chrX']
+    #chromosomes = ['chr22']
     #file_list = glob.glob(os.path.join(args.hicDir,'chr*/chr*.KRobserved'))
 
     all_data_list = []
@@ -85,11 +86,12 @@ def do_powerlaw_fit(HiC):
 
     #TO DO:
     #Print out mean/var plot of powerlaw relationship
-    HiC_summary = HiC.groupby('dist_for_fit').agg({'hic_kr' : 'sum'})
-    HiC_summary['hic_kr'] = HiC_summary.hic_kr / HiC_summary.hic_kr.sum() #technically this normalization should be over the entire genome (not just to maxWindow). Will only affect intercept though..
-    res = stats.linregress(np.log(HiC_summary.index), np.log(HiC_summary['hic_kr']))
+    HiC_summary = HiC.groupby('dist_for_fit').agg({'hic_contact' : 'sum'})
+    HiC_summary['hic_contact'] = HiC_summary.hic_contact / HiC_summary.hic_contact.sum() #technically this normalization should be over the entire genome (not just to maxWindow). Will only affect intercept though..
+    res = stats.linregress(np.log(HiC_summary.index), np.log(HiC_summary['hic_contact']))
 
-    hic_mean_var = HiC.groupby('dist_for_fit').agg({'hic_kr' : ['mean','var']})
+    hic_mean_var = HiC.groupby('dist_for_fit').agg({'hic_contact' : ['mean','var']})
+    hic_mean_var.columns = ['mean', 'var']
 
     return res.slope, res.intercept, hic_mean_var

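For reference, do_powerlaw_fit regresses log contact against log genomic distance, so the renamed hic_contact column feeds directly into the power-law parameters used downstream. Below is a minimal, self-contained sketch of that fit on toy data; the toy values and the expected_contact helper are illustrative only (in the repo, get_powerlaw_at_distance plays that role), while the groupby/normalize/linregress steps mirror the changed lines above.

import numpy as np
import pandas as pd
from scipy import stats

# Toy contact table: one row per bin pair, with the distance used for fitting
# and the renamed hic_contact value. All numbers are made up.
HiC = pd.DataFrame({
    'dist_for_fit': [5000, 5000, 10000, 10000, 20000, 20000, 40000, 40000],
    'hic_contact':  [0.90, 1.10, 0.45, 0.55, 0.24, 0.26, 0.12, 0.13],
})

# Sum contacts at each distance and normalize, as in do_powerlaw_fit.
HiC_summary = HiC.groupby('dist_for_fit').agg({'hic_contact': 'sum'})
HiC_summary['hic_contact'] = HiC_summary.hic_contact / HiC_summary.hic_contact.sum()

# Fit a line in log-log space; the slope is negative for decaying contact.
res = stats.linregress(np.log(HiC_summary.index), np.log(HiC_summary['hic_contact']))

# Expected contact at an arbitrary distance under the fitted power law
# (illustrative stand-in for get_powerlaw_at_distance).
def expected_contact(distance, slope, intercept):
    return np.exp(intercept) * distance ** slope

print(res.slope, res.intercept, expected_contact(15000, res.slope, res.intercept))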
src/hic.py

+22-9
@@ -8,11 +8,11 @@ def get_hic_file(chromosome, hic_dir, allow_vc=True):
     hic_norm = os.path.join(hic_dir, chromosome, chromosome + ".KRnorm.gz")
 
     is_vc = False
-    if allow_vc and not (os.path.exists(hic_file) and os.path.getsize(hic_file) > 0):
+    if allow_vc and not hic_exists(hic_file):
         hic_file = os.path.join(hic_dir, chromosome, chromosome + ".VCobserved.gz")
         hic_norm = os.path.join(hic_dir, chromosome, chromosome + ".VCnorm.gz")
 
-        if not (os.path.exists(hic_file) and os.path.getsize(hic_file) > 0):
+        if not hic_exists(hic_file):
             RuntimeError("Could not find KR or VC normalized hic files")
         else:
             print("Could not find KR normalized hic file. Using VC normalized hic file")
@@ -21,6 +21,15 @@ def get_hic_file(chromosome, hic_dir, allow_vc=True):
     print("Using: " + hic_file)
     return hic_file, hic_norm, is_vc
 
+def hic_exists(file):
+    if not os.path.exists(file):
+        return False
+    elif file.endswith('gz'):
+        #gzip file still have some size. This is a hack
+        return (os.path.getsize(file) > 100)
+    else:
+        return (os.path.getsize(file) > 0)
+
 def load_hic(hic_file, hic_norm_file, hic_is_vc, hic_type, hic_resolution, tss_hic_contribution, window, min_window, gamma, interpolate_nan=True, apply_diagonal_bin_correction=True):
     print("Loading HiC")
 
@@ -65,8 +74,12 @@ def process_hic(hic_mat, hic_norm_file, hic_is_vc, resolution, tss_hic_contribut
         sums = sums[~np.isnan(sums)]
         assert(np.max(sums[sums > 0])/np.min(sums[sums > 0]) < 1.001)
         mean_sum = np.mean(sums[sums > 0])
-        print('HiC Matrix has row sums of {}, making doubly stochastic...'.format(mean_sum))
-        hic_mat = hic_mat.multiply(1/mean_sum)
+
+        if abs(mean_sum - 1) < .001:
+            print('HiC Matrix has row sums of {}, continuing without making doubly stochastic'.format(mean_sum))
+        else:
+            print('HiC Matrix has row sums of {}, making doubly stochastic...'.format(mean_sum))
+            hic_mat = hic_mat.multiply(1/mean_sum)
 
         #Slow version. Its a constant scalar so don't need to to the matrix multiplication
         # kr_vec = np.repeat(np.sqrt(mean_sum), sums.shape[1])
@@ -105,7 +118,7 @@ def process_hic(hic_mat, hic_norm_file, hic_is_vc, resolution, tss_hic_contribut
 
     #Turn into dataframe
     hic_mat = hic_mat.tocoo(copy=False)
-    hic_df = pd.DataFrame({'bin1': hic_mat.row, 'bin2': hic_mat.col, 'hic_kr': hic_mat.data})
+    hic_df = pd.DataFrame({'bin1': hic_mat.row, 'bin2': hic_mat.col, 'hic_contact': hic_mat.data})
 
     #Prune to window
     hic_df = hic_df.loc[np.logical_and(abs(hic_df['bin1'] - hic_df['bin2']) <= window/resolution, abs(hic_df['bin1'] - hic_df['bin2']) >= min_window/resolution)]
@@ -116,8 +129,8 @@ def process_hic(hic_mat, hic_norm_file, hic_is_vc, resolution, tss_hic_contribut
     #So need to fill these. Use powerlaw.
     #Not ideal obviously but the scipy interpolation algos are either very slow or don't work since the nan structure implies that not all nans are interpolated
     if interpolate_nan:
-        nan_loc = np.isnan(hic_df['hic_kr'])
-        hic_df.loc[nan_loc,'hic_kr'] = get_powerlaw_at_distance(abs(hic_df.loc[nan_loc,'bin1'] - hic_df.loc[nan_loc,'bin2']) * resolution, gamma)
+        nan_loc = np.isnan(hic_df['hic_contact'])
+        hic_df.loc[nan_loc,'hic_contact'] = get_powerlaw_at_distance(abs(hic_df.loc[nan_loc,'bin1'] - hic_df.loc[nan_loc,'bin2']) * resolution, gamma)
 
     print('process.hic: Elapsed time: {}'.format(time.time() - t))
 
@@ -138,7 +151,7 @@ def apply_kr_threshold(hic_mat, hic_norm_file, kr_cutoff):
 
 def hic_to_sparse(filename, norm_file, resolution, hic_is_doubly_stochastic=False):
     t = time.time()
-    HiC = pd.read_table(filename, names=["bin1", "bin2", "hic_kr"],
+    HiC = pd.read_table(filename, names=["bin1", "bin2", "hic_contact"],
                         header=None, engine='c', memory_map=True)
 
     # verify our assumptions
@@ -153,7 +166,7 @@ def hic_to_sparse(filename, norm_file, resolution, hic_is_doubly_stochastic=Fals
     # accumulates repeated indices, so this will do the right thing.
     row = np.floor(HiC.bin1.values / resolution).astype(int)
     col = np.floor(HiC.bin2.values / resolution).astype(int)
-    dat = HiC.hic_kr.values
+    dat = HiC.hic_contact.values
 
     #JN: Need both triangles in order to compute row/column sums to make double stochastic.
     #If juicebox is upgraded to return DS matrices, then can remove one triangle

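The new hic_exists helper treats small gzip files as missing because even an empty .gz file has a nonzero size, and process_hic now skips the rescale when the matrix is already (near) doubly stochastic. Below is a small sketch of that row-sum check on a toy scipy sparse matrix; the matrix values are illustrative, and only the 0.001 threshold and the multiply-by-1/mean_sum step come from the diff.

import numpy as np
import scipy.sparse as ssp

# Toy symmetric contact matrix whose rows already sum to ~1 (illustrative values).
hic_mat = ssp.csr_matrix(np.array([[0.5, 0.3, 0.2],
                                   [0.3, 0.4, 0.3],
                                   [0.2, 0.3, 0.5]]))

# Column sums of the full matrix; for a symmetric matrix these equal the row sums.
sums = np.asarray(hic_mat.sum(axis=0)).flatten()
mean_sum = np.mean(sums[sums > 0])

# New behavior: only rescale when the matrix is not already doubly stochastic.
if abs(mean_sum - 1) < .001:
    print('HiC Matrix has row sums of {}, continuing without making doubly stochastic'.format(mean_sum))
else:
    print('HiC Matrix has row sums of {}, making doubly stochastic...'.format(mean_sum))
    hic_mat = hic_mat.multiply(1 / mean_sum)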
src/makeAverageHiC.py

+3-1
@@ -6,6 +6,8 @@
 from tools import write_params
 import pyranges
 
+# To do
+# Final output matrix needs to be KR normed as well
 def parseargs():
     class formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
         pass
@@ -89,7 +91,7 @@ def main():
     all_hic = all_hic.loc[np.logical_or(all_hic['avg_hic'] > 0, np.isnan(all_hic['avg_hic'])), ] # why do these 0's exist?
 
     os.makedirs(os.path.join(args.outDir, args.chromosome), exist_ok=True)
-    all_hic.to_csv(os.path.join(args.outDir, args.chromosome, args.chromosome + ".KRobserved.gz"), sep="\t", header=False, index=False, compression="gzip", na_rep=np.nan)
+    all_hic.to_csv(os.path.join(args.outDir, args.chromosome, args.chromosome + ".avg.gz"), sep="\t", header=False, index=False, compression="gzip", na_rep=np.nan)
 
 def scale_hic_with_powerlaw(hic, resolution, scale_ref, gamma_ref, scale, gamma):

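The renamed output is still a headerless, tab-separated, gzipped table, so it can be read back the same way hic_to_sparse in src/hic.py reads contact files. A hedged sketch follows, assuming the averaged table keeps a bin1/bin2/value column layout; the path and column names here are illustrative.

import pandas as pd

# Illustrative path; the script writes <outDir>/<chromosome>/<chromosome>.avg.gz
avg_file = "average_hic/chr22/chr22.avg.gz"

# Headerless, tab-separated, gzip-compressed, mirroring the to_csv call above.
avg_hic = pd.read_table(avg_file, names=["bin1", "bin2", "hic_contact"],
                        header=None, compression="gzip")
print(avg_hic.head())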
src/predictor.py

+13-15
@@ -13,7 +13,7 @@ def make_predictions(chromosome, enhancers, genes, args):
         hic_file, hic_norm_file, hic_is_vc = get_hic_file(chromosome, args.HiCdir)
         pred = add_hic_to_enh_gene_table(enhancers, genes, pred, hic_file, hic_norm_file, hic_is_vc, chromosome, args)
 
-        pred = compute_score(pred, [pred['activity_base'], pred['hic_kr_pl_scaled_adj']], "ABC")
+        pred = compute_score(pred, [pred['activity_base'], pred['hic_contact_pl_scaled_adj']], "ABC")
     pred = compute_score(pred, [pred['activity_base'], pred['powerlaw_contact_reference']], "powerlaw")
 
     return pred
@@ -81,22 +81,22 @@ def add_hic_to_enh_gene_table(enh, genes, pred, hic_file, hic_norm_file, hic_is_
         #Overlap in one direction
         enh_hic1 = df_to_pyranges(enh, start_col = 'enh_midpoint', end_col = 'enh_midpoint', end_slop = 1).join(hic1).df
         genes_hic2 = df_to_pyranges(genes, start_col = 'TargetGeneTSS', end_col = 'TargetGeneTSS', end_slop = 1).join(hic2).df
-        ovl12 = enh_hic1[['enh_idx','hic_idx','hic_kr']].merge(genes_hic2[['gene_idx', 'hic_idx']], on = 'hic_idx')
+        ovl12 = enh_hic1[['enh_idx','hic_idx','hic_contact']].merge(genes_hic2[['gene_idx', 'hic_idx']], on = 'hic_idx')
 
         #Overlap in the other direction
         enh_hic2 = df_to_pyranges(enh, start_col = 'enh_midpoint', end_col = 'enh_midpoint', end_slop = 1).join(hic2).df
         genes_hic1 = df_to_pyranges(genes, start_col = 'TargetGeneTSS', end_col = 'TargetGeneTSS', end_slop = 1).join(hic1).df
-        ovl21 = enh_hic2[['enh_idx','hic_idx','hic_kr']].merge(genes_hic1[['gene_idx', 'hic_idx']], on = ['hic_idx'])
+        ovl21 = enh_hic2[['enh_idx','hic_idx','hic_contact']].merge(genes_hic1[['gene_idx', 'hic_idx']], on = ['hic_idx'])
 
         #Concatenate both directions and merge into preditions
         ovl = pd.concat([ovl12, ovl21]).drop_duplicates()
         pred = pred.merge(ovl, on = ['enh_idx', 'gene_idx'], how = 'left')
-        pred.fillna(value={'hic_kr' : 0}, inplace=True)
+        pred.fillna(value={'hic_contact' : 0}, inplace=True)
     elif args.hic_type == "juicebox":
         #Merge directly using indices
         #Could also do this by indexing into the sparse matrix (instead of merge) but this seems to be slower
         #Index into sparse matrix
-        #pred['hic_kr'] = [HiC[i,j] for (i,j) in pred[['enh_bin','tss_bin']].values.tolist()]
+        #pred['hic_contact'] = [HiC[i,j] for (i,j) in pred[['enh_bin','tss_bin']].values.tolist()]
 
         pred['enh_bin'] = np.floor(pred['enh_midpoint'] / args.hic_resolution).astype(int)
         pred['tss_bin'] = np.floor(pred['TargetGeneTSS'] / args.hic_resolution).astype(int)
@@ -106,13 +106,13 @@ def add_hic_to_enh_gene_table(enh, genes, pred, hic_file, hic_norm_file, hic_is_
             pred['bin1'] = np.amin(pred[['enh_bin', 'tss_bin']], axis = 1)
             pred['bin2'] = np.amax(pred[['enh_bin', 'tss_bin']], axis = 1)
             pred = pred.merge(HiC, how = 'left', on = ['bin1','bin2'])
-            pred.fillna(value={'hic_kr' : 0}, inplace=True)
+            pred.fillna(value={'hic_contact' : 0}, inplace=True)
         else:
             # The matrix is not triangular, its full
             # For VC assume genes correspond to rows and columns to enhancers
            pred = pred.merge(HiC, how = 'left', left_on = ['tss_bin','enh_bin'], right_on=['bin1','bin2'])
 
-            pred.fillna(value={'hic_kr' : 0}, inplace=True)
+            pred.fillna(value={'hic_contact' : 0}, inplace=True)
 
         # QC juicebox HiC
         pred = qc_hic(pred)
@@ -136,13 +136,13 @@ def scale_with_powerlaw(pred, args):
     #Scale hic values to
 
     if not args.scale_hic_using_powerlaw:
-        pred['hic_kr_pl_scaled'] = pred['hic_kr']
+        pred['hic_contact_pl_scaled'] = pred['hic_contact']
     else:
         powerlaw_estimate = get_powerlaw_at_distance(pred['distance'].values, args.hic_gamma)
         powerlaw_estimate_reference = get_powerlaw_at_distance(pred['distance'].values, args.hic_gamma_reference)
         pred['powerlaw_contact'] = powerlaw_estimate
         pred['powerlaw_contact_reference'] = powerlaw_estimate_reference
-        pred['hic_kr_pl_scaled'] = pred['hic_kr'] * (powerlaw_estimate_reference / powerlaw_estimate)
+        pred['hic_contact_pl_scaled'] = pred['hic_contact'] * (powerlaw_estimate_reference / powerlaw_estimate)
 
     return(pred)
 
@@ -154,22 +154,20 @@ def add_hic_pseudocount(pred, args):
 
     pseudocount = np.amin(pd.DataFrame({'a' : powerlaw_fit, 'b' : powerlaw_fit_at_ref}), axis = 1)
     pred['hic_pseudocount'] = pseudocount
-    pred['hic_kr_pl_scaled_adj'] = pred['hic_kr_pl_scaled'] + pseudocount
+    pred['hic_contact_pl_scaled_adj'] = pred['hic_contact_pl_scaled'] + pseudocount
 
     return(pred)
 
 def qc_hic(pred, threshold = .01):
     # Genes with insufficient hic coverage should get nan'd
 
-    summ = pred.loc[pred['isSelfPromoter'],:].groupby(['TargetGene']).agg({'hic_kr' : 'sum'})
-    bad_genes = summ.loc[summ['hic_kr'] < threshold,:].index
+    summ = pred.loc[pred['isSelfPromoter'],:].groupby(['TargetGene']).agg({'hic_contact' : 'sum'})
+    bad_genes = summ.loc[summ['hic_contact'] < threshold,:].index
 
-    pred.loc[pred['TargetGene'].isin(bad_genes), 'hic_kr'] = np.nan
+    pred.loc[pred['TargetGene'].isin(bad_genes), 'hic_contact'] = np.nan
 
     return pred
 
-
-
 def compute_score(enhancers, product_terms, prefix):
 
     scores = np.column_stack(product_terms).prod(axis = 1)

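For context, compute_score (visible at the bottom of this diff) takes a row-wise product of its terms, so the ABC score numerator multiplies activity_base by the pseudocount-adjusted, power-law-scaled contact, while the powerlaw score swaps in powerlaw_contact_reference. A toy sketch of that product under the renamed columns; the input values and the output column name are illustrative guesses, not the repo's exact output schema.

import numpy as np
import pandas as pd

# Toy enhancer-gene pairs with the contact columns this commit renames/uses.
pred = pd.DataFrame({
    'activity_base': [4.0, 1.5, 0.8],
    'hic_contact_pl_scaled_adj': [0.02, 0.10, 0.01],
    'powerlaw_contact_reference': [0.03, 0.05, 0.01],
})

def compute_score(enhancers, product_terms, prefix):
    # Row-wise product of the supplied terms, as in predictor.py;
    # the output column name is a guess for illustration.
    scores = np.column_stack(product_terms).prod(axis=1)
    enhancers[prefix + '.Score.Numerator'] = scores
    return enhancers

pred = compute_score(pred, [pred['activity_base'], pred['hic_contact_pl_scaled_adj']], "ABC")
pred = compute_score(pred, [pred['activity_base'], pred['powerlaw_contact_reference']], "powerlaw")
print(pred)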