
Commit

resubmission
dkobak committed May 23, 2019
1 parent 245c72a commit 694d440
Showing 46 changed files with 36,395 additions and 6,826 deletions.
49 changes: 49 additions & 0 deletions data/harris-colormap.txt
@@ -0,0 +1,49 @@
0.7692 0 0
0.7134 0.0726 0
0.6652 0.1353 0
0.6230 0.1901 0
0.5859 0.2383 0
0.5530 0.2812 0
0.5235 0.3194 0
0.4971 0.3538 0
0.4731 0.3849 0
0.4514 0.4132 0
0.4306 0.4380 0
0.4040 0.4584 0
0.3749 0.4808 0
0.3428 0.5056 0
0.3071 0.5330 0
0.2674 0.5635 0
0.2229 0.5978 0
0.1726 0.6365 0
0.1153 0.6805 0
0.0496 0.7311 0
0 0.7692 0.0261
0 0.7692 0.1043
0 0.7692 0.1825
0 0.7692 0.2608
0 0.7692 0.3390
0 0.7692 0.4172
0 0.7692 0.4954
0 0.7692 0.5737
0 0.7692 0.6519
0 0.7692 0.7301
0 0.7598 0.8005
0 0.7386 0.8715
0 0.7131 0.9562
0 0.6441 1.0000
0 0.5424 1.0000
0 0.4407 1.0000
0 0.3390 1.0000
0 0.2373 1.0000
0 0.1356 1.0000
0 0.0339 1.0000
0.0678 0 1.0000
0.1695 0 1.0000
0.2712 0 1.0000
0.3729 0 1.0000
0.4746 0 1.0000
0.5763 0 1.0000
0.6780 0 1.0000
0.7221 0 0.9262
0.7461 0 0.8465
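
The 49 rows above are RGB triplets with values in [0, 1], one colour per cluster (the file name suggests it is the palette for the Harris et al. dataset). A minimal sketch of loading it as a matplotlib colormap; the path and the ListedColormap wrapper are illustrative, not part of the commit:

import numpy as np
from matplotlib.colors import ListedColormap

rgb = np.loadtxt('data/harris-colormap.txt')      # (49, 3) array of RGB values in [0, 1]
harris_cmap = ListedColormap(rgb, name='harris')
# e.g. plt.scatter(x, y, c=cluster_ids, cmap=harris_cmap)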
23,823 changes: 23,823 additions & 0 deletions data/tasic-sample_heatmap_plot_data.csv

Large diffs are not rendered by default.

3,380 changes: 469 additions & 2,911 deletions demo.ipynb

Large diffs are not rendered by default.

Binary file added figures/cadwell-mapping-600.pdf
Binary file not shown.
Binary file added figures/cadwell-mapping.pdf
Binary file not shown.
Binary file added figures/cadwell-mapping.png
Binary file added figures/featureSelection-600.pdf
Binary file not shown.
Binary file added figures/featureSelection.pdf
Binary file not shown.
Binary file added figures/featureSelection.png
Binary file added figures/mln-600.pdf
Binary file not shown.
Binary file added figures/mln-markers-600.pdf
Binary file not shown.
Binary file added figures/mln-markers.pdf
Binary file not shown.
Binary file added figures/mln-markers.png
Binary file added figures/mln.pdf
Binary file not shown.
Binary file added figures/mln.png
Binary file added figures/scan.pdf
Binary file not shown.
Binary file added figures/scan.png
Binary file added figures/tasic-subsets-pca-600.pdf
Binary file not shown.
Binary file added figures/tasic-subsets-pca.pdf
Binary file not shown.
Binary file added figures/tasic-subsets-pca.png
Binary file added figures/tasic-variants-600.pdf
Binary file not shown.
Binary file added figures/tasic-variants.pdf
Binary file not shown.
Binary file added figures/tasic-variants.png
Binary file added figures/toy-simulation-600.pdf
Binary file not shown.
Binary file added figures/toy-simulation.pdf
Binary file not shown.
Binary file added figures/toy-simulation.png
Binary file added figures/umap-big-600.pdf
Binary file not shown.
Binary file added figures/umap-big.pdf
Binary file not shown.
Binary file added figures/umap-big.png
Binary file added figures/umap-small-600.pdf
Binary file not shown.
Binary file added figures/umap-small.pdf
Binary file not shown.
Binary file added figures/umap-small.png
Binary file added figures/umis-600.pdf
Binary file not shown.
Binary file added figures/umis.pdf
Binary file not shown.
Binary file added figures/umis.png
1,331 changes: 324 additions & 1,007 deletions million-cells.ipynb

Large diffs are not rendered by default.

Binary file added pickles/tasic_scan.pickle
Binary file not shown.
Binary file modified pretty-perplexity.png
109 changes: 23 additions & 86 deletions rnaseqTools.py
@@ -28,7 +28,7 @@ def sparseload(filename, sep=',', dtype=float, chunksize=1000, index_col=0, drop


 def geneSelection(data, threshold=0, atleast=10,
-                  yoffset=.02, xoffset=5, decay=1, n=None,
+                  yoffset=.02, xoffset=5, decay=1.5, n=None,
                   plot=True, markers=None, genes=None, figsize=(6,3.5),
                   markeroffsets=None, labelsize=10, alpha=1):

@@ -46,7 +46,6 @@ def geneSelection(data, threshold=0, atleast=10,
     meanExpr[detected] = np.nanmean(np.where(data[:,detected]>threshold, np.log2(data[:,detected]), np.nan), axis=0)

     lowDetection = np.array(np.sum(data>threshold, axis=0)).squeeze() < atleast
-    #lowDetection = (1 - zeroRate) * data.shape[0] < atleast - .00001
     zeroRate[lowDetection] = np.nan
     meanExpr[lowDetection] = np.nan

@@ -88,12 +87,12 @@ def geneSelection(data, threshold=0, atleast=10,
         plt.text(.4, 0.2, '{} genes selected\ny = exp(-{:.1f}*(x-{:.2f}))+{:.2f}'.format(np.sum(selected),decay,xoffset, yoffset),
                  color='k', fontsize=labelsize, transform=plt.gca().transAxes)

-        plt.plot(x, y, color=sns.color_palette()[2], linewidth=2)
+        plt.plot(x, y, color=sns.color_palette()[1], linewidth=2)
         xy = np.concatenate((np.concatenate((x[:,None],y[:,None]),axis=1), np.array([[plt.xlim()[1], 1]])))
-        t = plt.matplotlib.patches.Polygon(xy, color='r', alpha=.2)
+        t = plt.matplotlib.patches.Polygon(xy, color=sns.color_palette()[1], alpha=.4)
         plt.gca().add_patch(t)

-        plt.scatter(meanExpr, zeroRate, s=3, alpha=alpha, rasterized=True)
+        plt.scatter(meanExpr, zeroRate, s=1, alpha=alpha, rasterized=True)
         if threshold==0:
             plt.xlabel('Mean log2 nonzero expression')
             plt.ylabel('Frequency of zero expression')
@@ -114,73 +113,6 @@ def geneSelection(data, threshold=0, atleast=10,
     return selected


-def scatterPlot(Z, dataset, size=4, s=3, title=None, labels_dy=2,
-                rasterized=True, alpha=.5, showlabels=True, showmeans=True,
-                hideclustermeans=None, hideticklabels=True,
-                hideclusterlabels=[], markers=None,
-                showclusterlabels=None, clusterlabeloffsets=None,
-                clusterlabelcolor='k', clusterlabelsplit=False):
-    if size is not None:
-        plt.figure(figsize=(size,size))
-
-    plt.scatter(Z[:,0], Z[:,1], s=s, color=dataset.clusterColors[dataset.clusters],
-                alpha=alpha, rasterized=rasterized)
-
-    K = dataset.clusterNames.size
-    Zmeans = np.zeros((K, 2))
-    for c in range(K):
-        Zmeans[c,:] = np.median(Z[dataset.clusters==c, :2], axis=0)
-    if hideclustermeans is not None:
-        Zmeans[hideclustermeans,:] = np.nan
-
-    if showmeans:
-        if markers is None:
-            markers = np.array(['o'] * K)
-        else:
-            markers = np.array(markers)
-        for m in np.unique(markers):
-            nonans = ~np.isnan(Zmeans[:,0])
-            plt.scatter(Zmeans[nonans,0][markers[nonans]==m], Zmeans[nonans,1][markers[nonans]==m],
-                        color=dataset.clusterColors[nonans][markers[nonans]==m], marker=m,
-                        s=40, edgecolor='k', linewidth=.7);
-
-    if showlabels:
-        if showclusterlabels is not None:
-            if isinstance(showclusterlabels[0], str):
-                showclusterlabels = [np.where(dataset.clusterNames==c)[0][0] for c in showclusterlabels]
-            else:
-                showclusterlabels = list(showclusterlabels)
-            hideclusterlabels = np.array([c for c in range(K) if c not in showclusterlabels])
-        for c in range(K):
-            if ~np.isnan(Zmeans[c,0]) and c not in hideclusterlabels:
-                if clusterlabeloffsets is not None and showclusterlabels is not None:
-                    dx,dy = clusterlabeloffsets[showclusterlabels.index(c)]
-                else:
-                    dx,dy = 0,labels_dy
-
-                if clusterlabelcolor is not None:
-                    col = clusterlabelcolor
-                else:
-                    col = dataset.clusterColors[c]
-
-                if clusterlabelsplit:
-                    label = '\n'.join(dataset.clusterNames[c].split(' '))
-                    hor = 'left'
-                else:
-                    label = dataset.clusterNames[c]
-                    hor = 'center'
-
-                plt.text(Zmeans[c,0]+dx, Zmeans[c,1]+dy, label, color=col,
-                         fontsize=7, horizontalalignment=hor)
-
-    if hideticklabels:
-        plt.gca().get_xaxis().set_ticklabels([])
-        plt.gca().get_yaxis().set_ticklabels([])
-    if title is not None:
-        plt.title(title)
-    if size is not None:
-        plt.tight_layout()
-

 # Computing the matrix of Euclidean distances
 def pdist2(A,B):
@@ -197,9 +129,11 @@ def corr2(A,B):
     return C

 def map_to_tsne(referenceCounts, referenceGenes, newCounts, newGenes, referenceAtlas,
-                bootstrap = False, knn = 25, nrep = 100, seed = None, batchsize = 1000):
+                bootstrap = False, knn = 25, nrep = 100, seed = None, batchsize = 1000,
+                verbose = 1):
     gg = list(set(referenceGenes) & set(newGenes))
-    print('Using a common set of ' + str(len(gg)) + ' genes.')
+    if verbose > 0:
+        print('Using a common set of ' + str(len(gg)) + ' genes.')

     newGenes = [np.where(newGenes==g)[0][0] for g in gg]
     refGenes = [np.where(referenceGenes==g)[0][0] for g in gg]
@@ -215,33 +149,36 @@ def map_to_tsne(referenceCounts, referenceGenes, newCounts, newGenes, referenceA
     n = X.shape[0]
     assignmentPositions = np.zeros((n, referenceAtlas.shape[1]))
     batchCount = int(np.ceil(n/batchsize))
-    if batchCount > 1:
+    if (batchCount > 1) and (verbose > 0):
         print('Processing in batches', end='', flush=True)
     for b in range(batchCount):
-        if batchCount > 1:
+        if (batchCount > 1) and (verbose > 0):
             print('.', end='', flush=True)
         batch = np.arange(b*batchsize, np.minimum((b+1)*batchsize, n))
         C = corr2(X[batch,:], T)
+        ind = np.argpartition(C, -knn)[:, -knn:]
         for i in range(batch.size):
-            ind = np.argsort(C[i,:])[::-1][:knn]
-            assignmentPositions[batch[i],:] = np.median(referenceAtlas[ind,:], axis=0)
-    if batchCount > 1:
+            assignmentPositions[batch[i],:] = np.median(referenceAtlas[ind[i,:],:], axis=0)
+    if (batchCount > 1) and (verbose > 0):
         print(' done', flush=True)

     # Note: currently bootstrapping does not support batchsize
     if bootstrap:
         if seed is not None:
             np.random.seed(seed)
         assignmentPositions_boot = np.zeros((n, referenceAtlas.shape[1], nrep))
-        print('Bootstrapping', end='', flush=True)
+        if verbose>0:
+            print('Bootstrapping', end='', flush=True)
         for rep in range(nrep):
-            print('.', end='')
+            if verbose>0:
+                print('.', end='')
             bootgenes = np.random.choice(T.shape[1], T.shape[1], replace=True)
             C_boot = corr2(X[:,bootgenes],T[:,bootgenes])
+            ind = np.argpartition(C, -knn)[:, -knn:]
             for i in range(X.shape[0]):
-                ind = np.argsort(C_boot[i,:])[::-1][:knn]
-                assignmentPositions_boot[i,:,rep] = np.median(referenceAtlas[ind,:], axis=0)
-        print(' done')
+                assignmentPositions_boot[i,:,rep] = np.median(referenceAtlas[ind[i,:],:], axis=0)
+        if verbose>0:
+            print(' done')
         return (assignmentPositions, assignmentPositions_boot)
     else:
         return assignmentPositions
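
The change above replaces a per-cell argsort of the correlation matrix with a single argpartition; in both versions every new cell is placed at the median atlas position of its knn most correlated reference cells. A standalone sketch of that positioning step, with illustrative variable names:

import numpy as np

def place_cells(C, referenceAtlas, knn=25):
    # C: (n_new, n_reference) correlations between new cells and reference cells
    # referenceAtlas: (n_reference, 2) t-SNE coordinates of the reference cells
    ind = np.argpartition(C, -knn, axis=1)[:, -knn:]   # top-knn neighbours per row (unordered)
    return np.array([np.median(referenceAtlas[row, :], axis=0) for row in ind])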
@@ -298,14 +235,14 @@ def map_to_clusters(referenceCounts, referenceGenes,
             clusterAssignment_matrix[cell, m] = mapsto_counts[i] / nrep

     if verbose:
-        for row in clusterAssignment_matrix:
+        for rownum,row in enumerate(clusterAssignment_matrix):
             ind = np.argsort(row)[::-1]
             ind = ind[:np.where(np.cumsum(row[ind]) >= until)[0][0] + 1]
             mystring = []
             for i in ind:
                 s = referenceClusterNames[i] + ' ({:.1f}%)'.format(100*row[i])
                 mystring.append(s)
-            mystring = cellNames[i] + ': ' + ', '.join(mystring)
+            mystring = cellNames[rownum] + ': ' + ', '.join(mystring)
             print(mystring)

     if returnCmeans:
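For orientation, the curve drawn by geneSelection above is y = exp(-decay*(x - xoffset)) + yoffset, plotted over mean log2 non-zero expression (x) versus the fraction of zero counts (y), with the region above the curve shaded as selected. A sketch of the implied selection rule; the explicit thresholding below is an inference from the plotting code, not copied from the commit:

import numpy as np

def select_genes(meanExpr, zeroRate, decay=1.5, xoffset=5, yoffset=.02):
    # meanExpr: mean log2 non-zero expression per gene; zeroRate: fraction of zero counts per gene
    curve = np.exp(-decay * (meanExpr - xoffset)) + yoffset
    return zeroRate > curve   # genes above the curve are selected
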
141 changes: 141 additions & 0 deletions server-10xdata.py
@@ -0,0 +1,141 @@
import numpy as np
import pandas as pd   # needed for pd.read_csv() below
import pickle

import sys
sys.path.append('/gpfs01/berens/user/dkobak/FIt-SNE')
from fast_tsne import fast_tsne


# LOAD AND PREPROCESS THE DATA

import scanpy.api as sc
sc.settings.verbosity = 2

# Data file is from here
# https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.3.0/1M_neurons
adata = sc.read_10x_h5('big-data/10x/1M_neurons_filtered_gene_bc_matrices_h5.h5')
sc.pp.recipe_zheng17(adata)

X = np.copy(adata.X)
X = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X, full_matrices=False)
U[:, np.sum(V,axis=1)<0] *= -1
X = np.dot(U, np.diag(s))
X = X[:, np.argsort(s)[::-1]][:,:50]
pickle.dump(X, open('big-pickles/10x-pca.pickle', 'wb'))
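
# Sketch (not in the original script): up to the sign convention applied to U above,
# the SVD block computes the top-50 PC scores. scikit-learn and a dense adata.X after
# recipe_zheng17 are assumptions here.
from sklearn.decomposition import PCA
X50 = PCA(n_components=50).fit_transform(adata.X)   # (n_cells, 50) PC scores, equal up to column signs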

# load cluster labels
# https://github.com/theislab/scanpy_usage/blob/master/170522_visualizing_one_million_cells/results/louvain.csv.gz
clusters = pd.read_csv('big-data/10x/louvain.csv', header=None).values[:,1].astype(int)


# DOWNSAMPLE AND RUN t-SNE

X = pickle.load(open('big-pickles/10x-pca.pickle', 'rb')).astype(float)
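# PCA initialization for t-SNE: the first two PCs, rescaled so that PC1 has standard deviation 0.0001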
PCAinit = X[:,:2] / np.std(X[:,0]) * 0.0001

np.random.seed(42)
ind25k = np.random.choice(X.shape[0], 25000, replace=False)
Z25k = fast_tsne(X[ind25k,:], perplexity_list=[30,int(25000/100)],
initialization = PCAinit[ind25k,:], seed=42,
learning_rate = 25000/12)
pickle.dump([Z25k, []], open("big-pickles/10x-downsampling.pickle", "wb"))

def downsampled_nn(X, Z, downsampled_ind, batchsize=1000, knn=10):
    ind_rest = np.where(~np.isin(np.arange(X.shape[0]), downsampled_ind))[0]
    steps = int(np.ceil(ind_rest.size/batchsize))
    positions = np.zeros((X.shape[0], 2))
    positions[downsampled_ind,:] = Z
    def pdist2(A,B):
        return np.sum(A**2,axis=1)[:, None] + np.sum(B**2, axis=1)[None, :] - 2 * A @ B.T

    for i in range(steps):
        print('.', end='', flush=True)
        if (i+1)%100==0:
            print('', flush=True)
        endind = np.min(((i+1)*batchsize, ind_rest.size))
        batch = ind_rest[i*batchsize:endind]
        D = pdist2(X[batch, :], X[downsampled_ind,:])
        ind = np.argpartition(D, knn)[:, :knn]
        for i in range(batch.size):
            positions[batch[i],:] = np.median(Z[ind[i,:],:], axis=0)
    print('', flush=True)
    return positions

%time positions = downsampled_nn(X, Z25k, ind25k, batchsize=10000) # 10 min
pickle.dump([Z25k, positions], open("big-pickles/10x-downsampling.pickle", "wb"))


# RUN T-SNE VARIANTS ON THE FULL DATA SET

X = pickle.load(open('big-pickles/10x-pca.pickle', 'rb'))
Z25k, positions = pickle.load(open('big-pickles/10x-downsampling.pickle', 'rb'))

Zs = {}

init25k = positions/np.std(positions[:,0]) * 0.0001
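# Full-data runs below: learning rate n/12 and, where indicated, late exaggeration (coefficient 4 from iteration 250);
# affinities are computed once with load_affinities='save' and reused with load_affinities='load'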
%time Z = fast_tsne(X, perplexity=30, initialization=init25k, late_exag_coeff=4, start_late_exag_iter=250, learning_rate=X.shape[0]/12, seed=42, load_affinities='save') # 13 min 37 s
Zs['mine'] = Z

Z = fast_tsne(X, perplexity=30, initialization=init25k, learning_rate=X.shape[0]/12, seed=42, load_affinities='load')
Zs['noexagg'] = Z

%time Z = fast_tsne(X, perplexity=30, initialization=PCAinit, late_exag_coeff=4, start_late_exag_iter=250, learning_rate=X.shape[0]/12, seed=42, load_affinities='load')
Zs['pcainit'] = Z

Z = fast_tsne(X, perplexity=30, initialization=PCAinit, learning_rate=X.shape[0]/12, seed=42, load_affinities='load')
Zs['noexagg-pcainit'] = Z

Z = fast_tsne(X, perplexity=30, late_exag_coeff=4, start_late_exag_iter=250, learning_rate=X.shape[0]/12, seed=42, load_affinities='load')
Zs['randinit'] = Z

Z = fast_tsne(X, perplexity=30, learning_rate=1000, seed=42, load_affinities='load')
Zs['scanpy'] = Z

Z = fast_tsne(X, perplexity=30, learning_rate=X.shape[0]/12, seed=42, load_affinities='load')
Zs['belkina'] = Z

Z = fast_tsne(X, perplexity=30, seed=42, load_affinities='load')
Zs['default'] = Z

import umap
%time Z = umap.UMAP(random_state=1).fit_transform(X) # 56 min
Zs['umap'] = Z

pickle.dump([Zs, clusters], open("big-pickles/10x-tsne.pickle", "wb"))


# EXTRACT MARKER GENES

import collections
import scipy.sparse as sp_sparse
import tables

f = tables.open_file('big-data/10x/1M_neurons_filtered_gene_bc_matrices_h5.h5', 'r')
group = f.get_node(f.root, 'mm10')
gene_ids = getattr(group, 'genes').read()
gene_names = getattr(group, 'gene_names').read().astype(str)
barcodes = getattr(group, 'barcodes').read()
data = getattr(group, 'data').read()
indices = getattr(group, 'indices').read()
indptr = getattr(group, 'indptr').read()
shape = getattr(group, 'shape').read()

matrix = sp_sparse.csc_matrix((data, indices, indptr), shape=shape)

markergenes = ['Snap25', 'Slc17a6', 'Slc17a7', 'Gad1', 'Gad2',
'Slc32a1', 'Mog', 'Aqp4', 'Pdgfra', 'Itgam', 'Flt1',
'Bgn', 'Olig1', 'Gja1', 'Xdh', 'Ctss', 'Myl9',
'Vip', 'Sst', 'Pvalb', 'Nrn1', 'S1pr1', 'Gia1',
'Gjb6', 'Lcat', 'Acsbg1', 'Neurod6', 'Akap7',
'Htr3a', 'Foxp2', 'Tubb23', 'Slc1a3', 'Top2a',
'Stmn2', 'Meg3', 'Nrp1', 'Tac2', 'Reln', 'Pax6',
'Tbr2', 'Tbr1', 'Eomes', 'Pax6', 'Tac1', 'Tubb3',
'Stmn2', 'Sox2', 'Aldoc', 'Hes1']

markerind = np.array([i for i,g in enumerate(gene_names) if g in markergenes])
markergenes = np.array([g for i,g in enumerate(gene_names) if g in markergenes])
markerexp = np.array(matrix[markerind,:].todense()).T.astype('float')

pickle.dump([markergenes, markerexp], open("big-pickles/10x-markers.pickle", "wb"))
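
# Hypothetical follow-up (not part of this script), assuming the cell order of markerexp
# matches the rows of the embeddings in Zs: colour the full t-SNE by one marker gene.
import matplotlib.pyplot as plt

g = np.where(markergenes == 'Snap25')[0][0]          # column index of one marker gene
plt.scatter(Zs['mine'][:,0], Zs['mine'][:,1], s=1,
            c=np.log2(markerexp[:, g] + 1), cmap='viridis', rasterized=True)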

