Issue42 #44

Merged: 2 commits, Sep 3, 2020
526 changes: 266 additions & 260 deletions notebooks/Palantir_sample_notebook.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
@@ -40,7 +40,7 @@
         "matplotlib>=2.2.2",
         "seaborn>=0.8.1",
         "tzlocal",
-        "scanpy",
+        "scanpy>=1.6.0",
     ],
     extras_require={
         'PLOT_GENE_TRENDS': ["rpy2>=3.0.2"]
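The pin to scanpy>=1.6.0 matches the utils.py change further down, which reads the kNN graph from .obsp instead of the old .uns slot; that motivation is an inference from the diff, not stated in the PR. A minimal sketch of the newer access pattern, on synthetic data:

    import numpy as np
    import scanpy as sc

    ad = sc.AnnData(np.random.rand(200, 40).astype(np.float32))
    sc.pp.neighbors(ad, n_neighbors=30)   # build the kNN graph
    knn = ad.obsp["distances"]            # scanpy >= 1.6 stores distances here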
2 changes: 1 addition & 1 deletion src/palantir/core.py
@@ -316,7 +316,7 @@ def _construct_markov_chain(wp_data, knn, pseudotime, n_jobs):
     # Directed graph construction
     # pseudotime position of all the neighbors
     traj_nbrs = pd.DataFrame(
-        pseudotime[np.ravel(waypoints[ind])].values.reshape(
+        pseudotime[np.ravel(waypoints.values[ind])].values.reshape(
             [len(waypoints), n_neighbors]
         ),
         index=waypoints,
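This one-liner appears to work around pandas' deprecation of multi-dimensional indexing on Index objects: indexing the underlying ndarray keeps 2-D fancy indexing working. A small illustration with hypothetical data (the real waypoints and ind come from the surrounding function):

    import numpy as np
    import pandas as pd

    waypoints = pd.Index(["c0", "c1", "c2"])
    ind = np.array([[0, 2], [1, 0], [2, 1]])  # kNN positions, one row per waypoint

    labels = waypoints.values[ind]  # 2-D fancy indexing on the ndarray works
    # waypoints[ind] relies on multi-dimensional indexing on a pandas Index,
    # deprecated in pandas 0.25 and removed in later releases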
6 changes: 5 additions & 1 deletion src/palantir/preprocess.py
@@ -2,6 +2,7 @@
 Functions for preprocessing of single cell RNA-seq counts
 """
 import numpy as np
+import scanpy as sc
 
 
 def filter_counts_data(data, cell_min_molecules=1000, genes_min_cells=10):
@@ -38,4 +39,7 @@ def log_transform(data, pseudo_count=0.1):
     :param data: Counts matrix: Cells x Genes
     :return: Log transformed matrix
     """
-    return np.log2(data + pseudo_count)
+    if type(data) is sc.AnnData:
+        data.X.data = np.log2(data.X.data + pseudo_count) - np.log2(pseudo_count)
+    else:
+        return np.log2(data + pseudo_count)
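A usage sketch for the two branches, on synthetic counts; note the AnnData path assumes a sparse .X and updates it in place rather than returning a new matrix:

    import numpy as np
    import pandas as pd
    import scanpy as sc
    from scipy.sparse import csr_matrix
    import palantir

    counts = pd.DataFrame(np.random.poisson(2.0, (100, 50)).astype(float))
    log_df = palantir.preprocess.log_transform(counts)  # returns a new matrix

    ad = sc.AnnData(csr_matrix(counts.values))
    palantir.preprocess.log_transform(ad)  # transforms ad.X.data in place, returns None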
39 changes: 27 additions & 12 deletions src/palantir/utils.py
@@ -3,23 +3,39 @@
 from MulticoreTSNE import MulticoreTSNE as TSNE
 import phenograph
 
-from sklearn.decomposition import PCA
 from scipy.sparse import csr_matrix, find, issparse
 from scipy.sparse.linalg import eigs
+import scanpy as sc
 
 
-def run_pca(data, n_components=300):
+def run_pca(data, n_components=300, use_hvg=True):
     """Run PCA
 
     :param data: Dataframe of cells X genes. Typicaly multiscale space diffusion components
     :param n_components: Number of principal components
     :return: PCA projections of the data and the explained variance
     """
-    pca = PCA(n_components=n_components, svd_solver="randomized")
-    pca_projections = pca.fit_transform(data)
-    pca_projections = pd.DataFrame(pca_projections, index=data.index)
-    return pca_projections, pca.explained_variance_ratio_
+    if type(data) is sc.AnnData:
+        ad = data
+    else:
+        ad = sc.AnnData(data.values)
+
+    # Run PCA
+    if not use_hvg:
+        n_comps = n_components
+    else:
+        sc.pp.pca(ad, n_comps=1000, use_highly_variable=True, zero_center=False)
+        try:
+            n_comps = np.where(np.cumsum(ad.uns['pca']['variance_ratio']) > 0.85)[0][0]
+        except IndexError:
+            n_comps = n_components
+
+    # Rerun with selection number of components
+    sc.pp.pca(ad, n_comps=n_comps, use_highly_variable=use_hvg, zero_center=False)
+
+    # Return PCA projections if it is a dataframe
+    pca_projections = pd.DataFrame(ad.obsm['X_pca'], index=ad.obs_names)
+    return pca_projections, ad.uns['pca']['variance_ratio']
 
 
 def run_diffusion_maps(data_df, n_components=10, knn=30, alpha=0):
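A usage sketch for the reworked run_pca, on synthetic data; with a plain DataFrame, use_hvg=True would require a highly_variable annotation in .var, so the sketch disables it:

    import numpy as np
    import pandas as pd
    import palantir

    norm_df = pd.DataFrame(np.random.rand(500, 200))
    pca_projections, variance_ratio = palantir.utils.run_pca(
        norm_df, n_components=50, use_hvg=False
    )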
@@ -38,11 +54,7 @@ def run_diffusion_maps(data_df, n_components=10, knn=30, alpha=0):
     print("Determing nearest neighbor graph...")
     temp = sc.AnnData(data_df.values)
     sc.pp.neighbors(temp, n_pcs=0, n_neighbors=knn)
-    # maintaining backwards compatibility to Scanpy `sc.pp.neighbors`
-    try:
-        kNN = temp.uns["neighbors"]["distances"]
-    except KeyError:
-        kNN = temp.obsp['distances']
+    kNN = temp.obsp['distances']
 
     # Adaptive k
     adaptive_k = int(np.floor(knn / 3))
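Downstream usage is unchanged at the API level; continuing the sketch above:

    dm_res = palantir.utils.run_diffusion_maps(pca_projections, n_components=10, knn=30)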
@@ -107,6 +119,9 @@ def run_magic_imputation(data, dm_res, n_steps=3):
     :param n_steps: Number of steps in the diffusion operator
     :return: Imputed data matrix
     """
+    if type(data) is sc.AnnData:
+        data = pd.DataFrame(data.X.todense(), index=data.obs_names, columns=data.var_names)
+
     T_steps = dm_res["T"] ** n_steps
     imputed_data = pd.DataFrame(
         np.dot(T_steps.todense(), data), index=data.index, columns=data.columns
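With the added guard, an AnnData can now be passed directly and is densified into a DataFrame first. Continuing the sketch, with norm_df as a cells x genes DataFrame:

    imputed_df = palantir.utils.run_magic_imputation(norm_df, dm_res)
    # An AnnData with sparse .X is now also accepted and densified internally:
    # imputed_df = palantir.utils.run_magic_imputation(ad, dm_res)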
@@ -161,6 +176,6 @@ def determine_cell_clusters(data, k=50):
     :return: Clusters
     """
     # Cluster and cluster centrolds
-    communities, _, _ = phenograph.cluster(data, k=k)
+    communities, _, _ = phenograph.cluster(data.values, k=k)
     communities = pd.Series(communities, index=data.index)
     return communities
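The .values change passes a bare ndarray to Phenograph while keeping the DataFrame index for the result labels; usage sketch, continuing the examples above:

    clusters = palantir.utils.determine_cell_clusters(pca_projections, k=50)
    # pd.Series of Phenograph community labels, indexed like the input DataFrame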