diff --git a/examples/refactor/ALR.ipynb b/examples/refactor/ALR.ipynb new file mode 100644 index 0000000..97a957f --- /dev/null +++ b/examples/refactor/ALR.ipynb @@ -0,0 +1,378 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "461ff352", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e536cc44", + "metadata": {}, + "outputs": [], + "source": [ + "%autoreload 1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "acadee5d", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import biom\n", + "import torch\n", + "import torch.nn as nn\n", + "%aimport mmvec.ALR\n", + "\n", + "import numpy as np\n", + "\n", + "torch.manual_seed(15)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "14d8bfab", + "metadata": {}, + "outputs": [], + "source": [ + "microbes = biom.load_table(\"./soil_microbes.biom\")\n", + "metabolites = biom.load_table(\"./soil_metabolites.biom\")\n", + "\n", + "model = mmvec.ALR.MMvecALR(microbes, metabolites, 15, sigma_u=1, sigma_v=1, holdout_num=5)\n", + "\n", + "microbes = microbes.to_dataframe().T\n", + "metabolites = metabolites.to_dataframe().T\n", + "microbes = microbes.loc[metabolites.index]\n", + "\n", + "microbe_idx = microbes.columns\n", + "metabolite_idx = metabolites.columns\n", + "microbes = torch.tensor(microbes.values, dtype=torch.int)\n", + "metabolites = torch.tensor(metabolites.values, dtype=torch.int64)\n", + "\n", + "microbe_relative_frequency = (microbes.T/microbes.sum(1)).T\n", + "\n", + "microbe_count = microbes.shape[1]\n", + "\n", + "metabolite_count = metabolites.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9003e38d", + "metadata": {}, + "outputs": [], + "source": [ + "#model = mmvec.ALR.MMvecALR(microbe_count, metabolite_count, 15, sigma_u=1, sigma_v=1)\n", + "learning_rate = 1e-3\n", + "batch_size = 500\n", + "epochs = 3000\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, maximize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29799ea5", + "metadata": {}, + "outputs": [], + "source": [ + "model.microbe_relative_freq(model.microbes_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b977e212", + "metadata": {}, + "outputs": [], + "source": [ + "maybe = mmvec.train.mmvec_training_loop(\n", + " model=model,\n", + "\n", + " batch_size=batch_size,\n", + " epochs=epochs,\n", + " learning_rate=learning_rate,\n", + " summary_interval = 25)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "30db58c0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_stats = pd.Dataframe.from_records(maybe, )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2eb531fe", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f27301f", + "metadata": {}, + "outputs": [], + "source": [ + "model.microbe_relative_freq(model.microbes_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a16a24a", + "metadata": {}, + "outputs": [], + "source": [ + "l" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3663ecb4", + "metadata": {}, + "outputs": [], + "source": [ + "# h = model.ranks_df - model.ranks_df.mean(axis=0)\n", + "h = model.ranks_matrix\n", + "\n", + "k = model.latent_dim" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b3bcb21", + "metadata": {}, + "outputs": [], + "source": [ + "h.mean(dim=0).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52e80094", + "metadata": {}, + "outputs": [], + "source": [ + "from torch.linalg import svd\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b24646d3", + "metadata": {}, + "outputs": [], + "source": [ + "u, s, v = svd(h, full_matrices=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afc2f855", + "metadata": {}, + "outputs": [], + "source": [ + "u.shape, s.shape, v.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b427a72", + "metadata": {}, + "outputs": [], + "source": [ + "s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "752acb7a", + "metadata": {}, + "outputs": [], + "source": [ + "model.U" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18824f01", + "metadata": {}, + "outputs": [], + "source": [ + "model.get_ordination()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5f97b6d", + "metadata": {}, + "outputs": [], + "source": [ + "ranks = pd.DataFrame(model.ranks(),\n", + " index=microbe_idx,\n", + " columns=metabolite_idx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99a455e5", + "metadata": {}, + "outputs": [], + "source": [ + "ranks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe702e51", + "metadata": {}, + "outputs": [], + "source": [ + "for param in model.parameters():\n", + " print(param.shape)\n", + "\n", + "print(model.parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76a6f81b", + "metadata": {}, + "outputs": [], + "source": [ + "a = torch.randint(10, (9, 3), dtype=torch.float)\n", + "b = torch.randint(10, (3,))\n", + "a, a - a.mean(dim=0), a.mean(dim=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa9e8f31", + "metadata": {}, + "outputs": [], + "source": [ + "bp = model.get_ordination()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b850001", + "metadata": {}, + "outputs": [], + "source": [ + "import os.path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e3b2153", + "metadata": {}, + "outputs": [], + "source": [ + "bp_path = os.path.join(\"/Users/keeganevans/Desktop/\", \"biplot\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b18f859b", + "metadata": {}, + "outputs": [], + "source": [ + "bp.write(bp_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34569491", + "metadata": {}, + "outputs": [], + "source": [ + "model.ranks_df" + ] + }, + { + "cell_type": "markdown", + "id": "46d440f7", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/refactor/soil_metabolites.biom b/examples/refactor/soil_metabolites.biom new file mode 100644 index 0000000..4bd76fb Binary files /dev/null and b/examples/refactor/soil_metabolites.biom differ diff --git a/examples/refactor/soil_microbes.biom b/examples/refactor/soil_microbes.biom new file mode 100644 index 0000000..0791df7 Binary files /dev/null and b/examples/refactor/soil_microbes.biom differ diff --git a/mmvec/ALR.py b/mmvec/ALR.py new file mode 100644 index 0000000..30b8499 --- /dev/null +++ b/mmvec/ALR.py @@ -0,0 +1,275 @@ +from numpy import float64 +import pandas as pd + +import torch +from torch import linalg +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import Multinomial, Normal + +from skbio import OrdinationResults + + +def structure_data(microbes, metabolites, holdout_num): + if type(microbes) is not pd.core.frame.DataFrame: + microbes = microbes.to_dataframe().T + if type(metabolites) is not pd.core.frame.DataFrame: + metabolites = metabolites.to_dataframe().T + # idx = microbes.index.intersection(metabolites.index) + # microbes = microbes.loc[idx] + # metabolites = metabolites.loc[idx] + #make sure none sum to zero + #microbes = microbes.loc[:, microbes.sum(axis=0) > 0] + # microbes = microbes.loc[microbes.sum(axis=1) > 0] + + microbe_idx = microbes.columns + metabolite_idx = metabolites.columns + + microbes_train = torch.tensor(microbes[:-holdout_num].values, + dtype=torch.float64) + metabolites_train = torch.tensor(metabolites[:-holdout_num].values, + dtype=torch.float64) + + microbes_test = torch.tensor(microbes[-holdout_num:].values, + dtype=torch.float64) + metabolites_test = torch.tensor(metabolites[-holdout_num:].values, + dtype=torch.float64) + microbe_count = microbes_train.shape[1] + metabolite_count = metabolites_train.shape[1] + + microbes = torch.tensor(microbes.values, dtype=torch.int) + metabolites = torch.tensor(metabolites.values, dtype=torch.int64) + + nnz = torch.count_nonzero(microbes).item() + + return (microbes_train, microbes_test, metabolites_train, + metabolites_test, microbe_idx, metabolite_idx, microbe_count, + metabolite_count, nnz) + + +class LinearALR(nn.Module): + def __init__(self, input_dim, output_dim): + super().__init__() + self.linear = nn.Linear(input_dim, output_dim - 1) + + def forward(self, x): + y = self.linear(x) + z = torch.zeros((y.shape[0], y.shape[1], 1)) + y = torch.cat((z, y), dim=2) + + return F.softmax(y, dim=2) + + +class MMvecALR(nn.Module): + def __init__(self, microbes, metabolites, latent_dim, sigma_u=1, + sigma_v=1, holdout_num=4): + super().__init__() + + # Data setup + (self.microbes_train, self.microbes_test, self.metabolites_train, + self.metabolites_test, self.microbe_idx, self.metabolite_idx, + self.num_microbes, self.num_metabolites, + self.nnz) = structure_data( + microbes, metabolites, holdout_num) + + self.sigma_u = sigma_u + self.sigma_v = sigma_v + self.latent_dim = latent_dim + # TODO: intialize same way as linear bias + self.encoder_bias = nn.parameter.Parameter( + torch.randn((self.num_microbes, 1))) + + self.encoder = nn.Embedding(self.num_microbes, self.latent_dim) + self.decoder = LinearALR(self.latent_dim, self.num_metabolites) + + + def forward(self, X, Y): + # Three likelihoods, the likelihood of each weight and the likelihood + # of the data fitting in the way that we thought + # LYs + z = self.encoder(X) + z = z + self.encoder_bias[X].reshape((*X.shape, 1)) + y_pred = self.decoder(z) + + predicted = torch.distributions.multinomial.Multinomial(total_count=0, + validate_args=False, + probs=y_pred) + + data_likelihood = predicted.log_prob(Y) + + l_y = data_likelihood.sum(0).mean() + + u_weights = self.encoder.weight + l_u = Normal(0, self.sigma_u).log_prob(u_weights).sum() + l_ubias = Normal(0, self.sigma_u).log_prob(self.encoder_bias).sum() + + v_weights = self.decoder.linear.weight + l_v = Normal(0, self.sigma_v).log_prob(v_weights).sum() + l_vbias = Normal(0, self.sigma_v).log_prob(self.decoder.linear.bias).sum() + + likelihood_sum = l_y + l_u + l_v + l_ubias + l_vbias + return likelihood_sum + + def get_ordination(self, equalize_biplot=False): + biplot = get_ordination_bare(self.ranks(), self.microbe_idx, + self.metabolite_idx, equalize_biplot=False) + return biplot + +# ranks = self.ranks() +# ranks = ranks - ranks.mean(dim=0) +# +# u, s_diag, v = linalg.svd(ranks, full_matrices=False) +# +# +# # us torch.diag to go from vector to matrix with the vector on dia +# if equalize_biplot: +# #microbe_embed = u @ torch.sqrt( +# # torch.diag(s_diag)).detach().numpy() +# microbe_embed = u @ torch.sqrt( +# torch.diag(s_diag)) +# microbe_embed = microbe_embed.detach().numpy() +# #metabolite_embed = v.T @ torch.sqrt(s_diag).detach().numpy() +# metabolite_embed = v.T @ torch.sqrt(s_diag) +# metabolite_embed = metabolite_embed.detach().numpy() +# else: +# microbe_embed = u @ torch.diag(s_diag) +# microbe_embed = microbe_embed.detach().numpy() +# metabolite_embed = v.T +# metabolite_embed = metabolite_embed.detach().numpy() +# +# pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])] +# +# +# features = pd.DataFrame( +# microbe_embed, columns=pc_ids, index=self.microbe_idx) +# +# samples = pd.DataFrame(metabolite_embed, columns=pc_ids, +# index=self.metabolite_idx) +# +# short_method_name = 'mmvec biplot' +# long_method_name = 'Multiomics mmvec biplot' +# eigvals = pd.Series(s_diag, index=pc_ids) +# proportion_explained = pd.Series( +# torch.square(s_diag).detach().numpy() / torch.sum( +# torch.square(s_diag)).detach().numpy(), +# index=pc_ids, dtype=float64) +# +# biplot = OrdinationResults( +# short_method_name, long_method_name, eigvals, +# samples=samples, features=features, +# proportion_explained=proportion_explained) +# +# return biplot +# + + + @property + def u_bias(self): + #ensure consistent access + return self.encoder_bias.detach() + + @property + def v_bias(self): + #ensure consistent access + return self.decoder.linear.bias.detach() + + @property + def U(self): + U = torch.cat( + (torch.ones((self.encoder.weight.shape[0], 1)), + self.u_bias, + self.encoder.weight.detach()), + dim=1) + return U + + @property + def V(self): + V = torch.cat( + (self.v_bias.unsqueeze(dim=0), + torch.ones((1, self.decoder.linear.weight.shape[0] )), + self.decoder.linear.weight.detach().T), + dim=0) + return V + + def ranks_dataframe(self): + return pd.DataFrame(self.ranks(), index=self.microbe_idx, + columns=self.metabolite_idx) + + def ranks(self): + return ranks_bare(self.U, self.V) + #def ranks(self): + # # Adding the zeros is part of the inverse ALR. + # res = torch.cat(( + # torch.zeros((self.num_microbes, 1)), + # self.U @ self.V + # ), dim=1) + # res = res - res.mean(axis=1).reshape(-1, 1) + # return res + + def microbe_relative_freq(self,microbes): + return (microbes.T / microbes.sum(1)).T + + #def loss_fn(self, y_pred, observed): + + # predicted = torch.distributions.multinomial.Multinomial(total_count=0, + # validate_args=False, + # probs=y_pred) + # + # data_likelihood = predicted.log_prob(observed) + # l_y = data_likelihood.sum(1).mean() +### bare functions for testing/method creation + +def ranks_bare(u, v): + # Adding the zeros is part of the inverse ALR. + res = torch.cat(( + torch.zeros((u.shape[0], 1)), + u @ v + ), dim=1) + res = res - res.mean(axis=1).reshape(-1, 1) + return res + +def get_ordination_bare(ranks, microbe_index, metabolite_index, + equalize_biplot=False): + + ranks = ranks - ranks.mean(dim=0) + + u, s_diag, v = linalg.svd(ranks, full_matrices=False) + + + # us torch.diag to go from vector to matrix with the vector on dia + if equalize_biplot: + microbe_embed = u @ torch.sqrt( + torch.diag(s_diag)) + microbe_embed = microbe_embed.detach().numpy() + metabolite_embed = v.T @ torch.sqrt(s_diag) + metabolite_embed = metabolite_embed.detach().numpy() + else: + microbe_embed = u @ torch.diag(s_diag) + microbe_embed = microbe_embed.detach().numpy() + metabolite_embed = v.T + metabolite_embed = metabolite_embed.detach().numpy() + + pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])] + + + features = pd.DataFrame( + microbe_embed, columns=pc_ids, index=microbe_index) + + samples = pd.DataFrame(metabolite_embed, columns=pc_ids, + index=metabolite_index) + + short_method_name = 'mmvec biplot' + long_method_name = 'Multiomics mmvec biplot' + eigvals = pd.Series(s_diag, index=pc_ids) + proportion_explained = pd.Series( + torch.square(s_diag).detach().numpy() / torch.sum( + torch.square(s_diag)).detach().numpy(), + index=pc_ids, dtype=float64) + + biplot = OrdinationResults( + short_method_name, long_method_name, eigvals, + samples=samples, features=features, + proportion_explained=proportion_explained) + + return biplot + diff --git a/mmvec/ILR.py b/mmvec/ILR.py new file mode 100644 index 0000000..1f5c132 --- /dev/null +++ b/mmvec/ILR.py @@ -0,0 +1,112 @@ + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import Multinomial, Normal + +import numpy as np + +from gneiss.cluster import random_linkage +from gneiss.balances import sparse_balance_basis + + + +class MMvecILR(nn.Module): + def __init__(self, num_microbes, num_metabolites, latent_dim, sigma_u, + sigma_v): + super().__init__() + + self.latent_dim = latent_dim + self.num_microbes = num_microbes + self.num_metabolites = num_metabolites + + self.u_bias = nn.parameter.Parameter(torch.randn((num_microbes, 1))) + + self.encoder = nn.Embedding(num_microbes, latent_dim) + #self.decoder = nn.Sequential( + # nn.Linear(latent_dim, num_metabolites), + # nn.Softmax(dim=2) + # ) + self.decoder = LinearILR(latent_dim, num_metabolites) + + self.sigma_u = sigma_u + self.sigma_v = sigma_v + + def forward(self, X, Y): + # Three likelihoods, the likelihood of each weight and the likelihood + # of the data fitting in the way that we thought + # LYs + z = self.encoder(X) + z = z + self.u_bias[X].reshape((*X.shape, 1)) + y_pred = self.decoder(z) + + forward_dist = Multinomial(total_count=0, + validate_args=False, + probs=y_pred) + + forward_dist = forward_dist.log_prob(Y) + + l_y = forward_dist.mean(0).mean() + + # LU + u_weights = self.encoder.weight + l_u = Normal(0, self.sigma_u).log_prob(u_weights).sum() + #l_u = torch.normal(0, self.sigma_u).log_prob(z + + # LV + # index zero currently holds "linear", may need to be changed later + v_weights = self.decoder.linear.weight + l_v = Normal(0, self.sigma_v).log_prob(v_weights).sum() + + likelihood_sum = l_y + l_u + l_v + return likelihood_sum + + + + def ILRranks(self): + #modelU = np.hstack( + # (np.ones((self.num_microbes, 1)), + # self.u_bias.detach().numpy(), + # self.encoder.weight.detach().numpy())) + + U = torch.cat( + (torch.from_numpy(np.ones((self.num_microbes, 1))), + self.u_bias.detach(), + self.encoder.weight.detach()), + dim=-1) + + V = torch.stack( + (self.decoder.linear.bias.detach(), + torch.from_numpy(np.ones((1, self.num_metabolites))), + self.decoder.linear.weight.detach().T), + dim=0) + + #V = torch.sparse.mm(modelV, self.decoder.Psi.T) + #res = modelU V + res = U @ V @ self.decoder.Psi.to_dense().T + #res = modelU @ modelV @ self.decoder.Psi.T + #print(res) + #res = res - res.mean(axis=1).reshape(-1, 1) + return res + + +class LinearILR(nn.Module): + def __init__(self, input_dim, output_dim): + super().__init__() + tree = random_linkage(output_dim) # pick random tree it doesn't really matter tbh + basis = sparse_balance_basis(tree)[0].copy() + indices = np.vstack((basis.row, basis.col)) + Psi = torch.sparse_coo_tensor( + indices.copy(), + basis.data.astype(np.float32).copy(), + dtype=torch.double, + requires_grad=False).coalesce() + + self.linear = nn.Linear(input_dim, output_dim) + self.register_buffer('Psi', Psi) + + def forward(self, x): + y = self.linear(x) + logy = (Psi.t() @ y.t()).t() + return F.softmax(logy, dim=1) + diff --git a/mmvec/__init__.py b/mmvec/__init__.py index d1646ad..97c8211 100644 --- a/mmvec/__init__.py +++ b/mmvec/__init__.py @@ -1,5 +1,9 @@ from .heatmap import _heatmap_choices, _cmaps +from .ALR import MMvecALR +from .ILR import MMvecILR +from .train import mmvec_training_loop __version__ = "1.0.6" -__all__ = ['_heatmap_choices', '_cmaps'] +__all__ = ['_heatmap_choices', '_cmaps', 'MMvecALR', 'MMvecILR', + 'mmvec_training_loop'] diff --git a/mmvec/multimodal.py b/mmvec/multimodal.py index eb54ded..2c7017b 100644 --- a/mmvec/multimodal.py +++ b/mmvec/multimodal.py @@ -4,6 +4,7 @@ import numpy as np import tensorflow as tf from tensorflow.contrib.distributions import Multinomial, Normal +from mmvec.ALR import MMvecALR import datetime diff --git a/mmvec/q2/_method.py b/mmvec/q2/_method.py index bb6dfbf..e395a7d 100644 --- a/mmvec/q2/_method.py +++ b/mmvec/q2/_method.py @@ -1,11 +1,11 @@ import biom import pandas as pd import numpy as np -import tensorflow as tf from skbio import OrdinationResults import qiime2 from qiime2.plugin import Metadata -from mmvec.multimodal import MMvec +from mmvec.train import mmvec_training_loop +from mmvec.ALR import MMvecALR from mmvec.util import split_tables from scipy.sparse import coo_matrix from scipy.sparse.linalg import svds @@ -13,7 +13,7 @@ def paired_omics(microbes: biom.Table, metabolites: biom.Table, - metadata: Metadata = None, + metadata: qiime2.Metadata = None, training_column: str = None, num_testing_examples: int = 5, min_feature_count: int = 10, @@ -26,12 +26,13 @@ def paired_omics(microbes: biom.Table, equalize_biplot: float = False, arm_the_gpu: bool = False, summary_interval: int = 60) -> ( - pd.DataFrame, OrdinationResults, qiime2.Metadata - ): + pd.DataFrame, OrdinationResults, qiime2.Metadata + ): if metadata is not None: metadata = metadata.to_dataframe() + #TODO refactor for pytorch! if arm_the_gpu: # pick out the first GPU device_name = '/device:GPU:0' @@ -54,73 +55,26 @@ def paired_omics(microbes: biom.Table, train_microbes_coo = coo_matrix(train_microbes_df.values) test_microbes_coo = coo_matrix(test_microbes_df.values) - with tf.Graph().as_default(), tf.Session() as session: - model = MMvec( - latent_dim=latent_dim, - u_scale=input_prior, v_scale=output_prior, - batch_size=batch_size, - device_name=device_name, - learning_rate=learning_rate) - model(session, - train_microbes_coo, train_metabolites_df.values, - test_microbes_coo, test_metabolites_df.values) - - loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval) - ranks = pd.DataFrame(model.ranks(), index=train_microbes_df.columns, - columns=train_metabolites_df.columns) - if latent_dim > 0: - u, s, v = svds(ranks - ranks.mean(axis=0), k=latent_dim) - else: - # fake it until you make it - u, s, v = svds(ranks - ranks.mean(axis=0), k=1) - - ranks = ranks.T - ranks.index.name = 'featureid' - s = s[::-1] - u = u[:, ::-1] - v = v[::-1, :] - if equalize_biplot: - microbe_embed = u @ np.sqrt(np.diag(s)) - metabolite_embed = v.T @ np.sqrt(np.diag(s)) - else: - microbe_embed = u @ np.diag(s) - metabolite_embed = v.T - - pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])] - features = pd.DataFrame( - microbe_embed, columns=pc_ids, - index=train_microbes_df.columns) - samples = pd.DataFrame( - metabolite_embed, columns=pc_ids, - index=train_metabolites_df.columns) - short_method_name = 'mmvec biplot' - long_method_name = 'Multiomics mmvec biplot' - eigvals = pd.Series(s, index=pc_ids) - proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids) - biplot = OrdinationResults( - short_method_name, long_method_name, eigvals, - samples=samples, features=features, - proportion_explained=proportion_explained) - - its = np.arange(len(loss)) - convergence_stats = pd.DataFrame( - { - 'loss': loss, - 'cross-validation': cv, - 'iteration': its - } + model = MMvecALR( + microbes=microbes, + metabolites= metabolites, + latent_dim=latent_dim, + sigma_u=input_prior, sigma_v=output_prior, ) - convergence_stats.index.name = 'id' - convergence_stats.index = convergence_stats.index.astype(np.str) + convergence_stats = pd.DataFrame.from_records(mmvec_training_loop(model=model, + learning_rate=learning_rate, epochs=epochs, batch_size=batch_size, + summary_interval=summary_interval) + , + columns=['iteration','loss', 'cross-validation']) - c = convergence_stats['loss'].astype(np.float) - convergence_stats['loss'] = c - c = convergence_stats['cross-validation'].astype(np.float) - convergence_stats['cross-validation'] = c + convergence_stats.astype({'loss': 'float', 'cross-validation': + 'float'}, copy=False) - c = convergence_stats['iteration'].astype(np.int) - convergence_stats['iteration'] = c + convergence_stats.set_index("iteration", inplace=True) + convergence_stats.index.name="id" - return ranks, biplot, qiime2.Metadata(convergence_stats) + biplot = model.get_ordination() + ranks = model.ranks_dataframe() + return ranks, biplot, qiime2.Metadata(convergence_stats) diff --git a/mmvec/q2/tests/test_functions.py b/mmvec/q2/tests/test_functions.py new file mode 100644 index 0000000..c4ba9bc --- /dev/null +++ b/mmvec/q2/tests/test_functions.py @@ -0,0 +1,17 @@ +import unittest +from numpy.testing._private.utils import assert_equal +import torch +from mmvec.ALR import ranks_bare + +class TestBareFunctions(unittest.TestCase): + def setUp(self) -> None: + self.u = torch.rand(4, 5) + self.v = torch.rand(5, 3) + return super().setUp() + + def test_ranks(self): + ranks = ranks_bare(self.u, self.v) + print(ranks) + + assert_equal(ranks.shape[0], self.u.shape[0]) + assert_equal(ranks.shape[1], (self.v.shape[1] + 1)) diff --git a/mmvec/q2/tests/test_method.py b/mmvec/q2/tests/test_method.py index 2bae849..9062394 100644 --- a/mmvec/q2/tests/test_method.py +++ b/mmvec/q2/tests/test_method.py @@ -1,7 +1,6 @@ import biom import unittest import numpy as np -import tensorflow as tf from mmvec.q2._method import paired_omics from mmvec.util import random_multimodal from skbio.stats.composition import clr_inv @@ -11,6 +10,22 @@ class TestMMvec(unittest.TestCase): + #def setUp(self): + # # build small simulation + # np.random.seed(1) + # res = random_multimodal( + # num_microbes=8, num_metabolites=8, num_samples=150, + # latent_dim=2, sigmaQ=2, + # microbe_total=1000, metabolite_total=10000, seed=1 + # ) + # (self.microbes, self.metabolites, self.X, self.B, + # self.U, self.Ubias, self.V, self.Vbias) = res + # num_train = 10 + # self.trainX = self.microbes.iloc[:-num_train] + # self.testX = self.microbes.iloc[-num_train:] + # self.trainY = self.metabolites.iloc[:-num_train] + # self.testY = self.metabolites.iloc[-num_train:] + def setUp(self): np.random.seed(1) res = random_multimodal( @@ -40,8 +55,8 @@ def setUp(self): def test_fit(self): np.random.seed(1) - tf.reset_default_graph() - tf.set_random_seed(0) + #tf.reset_default_graph() + #tf.set_random_seed(0) latent_dim = 2 res_ranks, res_biplot, _ = paired_omics( self.microbes, self.metabolites, @@ -70,8 +85,8 @@ def test_fit(self): def test_equalize_sv(self): np.random.seed(1) - tf.reset_default_graph() - tf.set_random_seed(0) + #tf.reset_default_graph() + #tf.set_random_seed(0) latent_dim = 2 res_ranks, res_biplot, _ = paired_omics( self.microbes, self.metabolites, diff --git a/mmvec/tests/test_multimodal.py b/mmvec/tests/test_multimodal.py index d98f4c6..1d809da 100644 --- a/mmvec/tests/test_multimodal.py +++ b/mmvec/tests/test_multimodal.py @@ -9,13 +9,12 @@ from scipy.stats import spearmanr from scipy.sparse import coo_matrix from scipy.spatial.distance import pdist -from mmvec.multimodal import MMvec +from mmvec.ALR import MMvecALR +from mmvec.train import mmvec_training_loop from mmvec.util import random_multimodal -from tensorflow import set_random_seed -import tensorflow as tf -class TestMMvec(unittest.TestCase): +class TestALR(unittest.TestCase): def setUp(self): # build small simulation np.random.seed(1) @@ -39,136 +38,143 @@ def tearDown(self): def test_fit(self): np.random.seed(1) - tf.reset_default_graph() + #tf.reset_default_graph() n, d1 = self.trainX.shape n, d2 = self.trainY.shape - with tf.Graph().as_default(), tf.Session() as session: - set_random_seed(0) - model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2) - model(session, - coo_matrix(self.trainX.values), self.trainY.values, - coo_matrix(self.testX.values), self.testY.values) - model.fit(epoch=1000) - - U_ = np.hstack( - (np.ones((self.U.shape[0], 1)), self.Ubias, self.U)) - V_ = np.vstack( - (self.Vbias, np.ones((1, self.V.shape[1])), self.V)) - - u_r, u_p = spearmanr(pdist(model.U), pdist(self.U)) - v_r, v_p = spearmanr(pdist(model.V.T), pdist(self.V.T)) - - res = softmax(model.ranks()) - exp = softmax(np.hstack((np.zeros((d1, 1)), U_ @ V_))) - s_r, s_p = spearmanr(np.ravel(res), np.ravel(exp)) - - self.assertGreater(u_r, 0.5) - self.assertGreater(v_r, 0.5) - self.assertGreater(s_r, 0.5) - self.assertLess(u_p, 5e-2) - self.assertLess(v_p, 5e-2) - self.assertLess(s_p, 5e-2) - - # sanity check cross validation - self.assertLess(model.cv.eval(), 500) - - -class TestMMvecSoilsBenchmark(unittest.TestCase): - def setUp(self): - self.microbes = load_table(get_data_path('soil_microbes.biom')) - self.metabolites = load_table(get_data_path('soil_metabolites.biom')) - X = self.microbes.to_dataframe().T - Y = self.metabolites.to_dataframe().T - X = X.loc[Y.index] - self.trainX = X.iloc[:-2] - self.trainY = Y.iloc[:-2] - self.testX = X.iloc[-2:] - self.testY = Y.iloc[-2:] - - def tearDown(self): - # remove all log directories - for r in glob.glob("logdir*"): - shutil.rmtree(r) - - def test_soils(self): - np.random.seed(1) - tf.reset_default_graph() - n, d1 = self.trainX.shape - n, d2 = self.trainY.shape - - with tf.Graph().as_default(), tf.Session() as session: - set_random_seed(0) - model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=1, - learning_rate=1e-3) - model(session, - coo_matrix(self.trainX.values), self.trainY.values, - coo_matrix(self.testX.values), self.testY.values) - model.fit(epoch=1000) - - ranks = pd.DataFrame( - model.ranks(), - index=self.microbes.ids(axis='observation'), - columns=self.metabolites.ids(axis='observation')) - - microcoleus_metabolites = [ - '(3-methyladenine)', '7-methyladenine', '4-guanidinobutanoate', - 'uracil', 'xanthine', 'hypoxanthine', '(N6-acetyl-lysine)', - 'cytosine', 'N-acetylornithine', 'N-acetylornithine', - 'succinate', 'adenosine', 'guanine', 'adenine'] - mprobs = ranks.loc['rplo 1 (Cyanobacteria)'] - self.assertEqual(np.sum(mprobs.loc[microcoleus_metabolites] > 0), - len(microcoleus_metabolites)) - - -class TestMMvecBenchmark(unittest.TestCase): - def setUp(self): - # build small simulation - res = random_multimodal( - num_microbes=100, num_metabolites=1000, num_samples=300, - latent_dim=2, sigmaQ=2, - microbe_total=5000, metabolite_total=10000, seed=1 - ) - (self.microbes, self.metabolites, self.X, self.B, - self.U, self.Ubias, self.V, self.Vbias) = res - num_train = 10 - self.trainX = self.microbes.iloc[:-num_train] - self.testX = self.microbes.iloc[-num_train:] - self.trainY = self.metabolites.iloc[:-num_train] - self.testY = self.metabolites.iloc[-num_train:] - - @unittest.skip("Only for benchmarking") - def test_gpu(self): - np.random.seed(1) - tf.reset_default_graph() - n, d1 = self.trainX.shape - n, d2 = self.trainY.shape - - with tf.Graph().as_default(), tf.Session() as session: - set_random_seed(0) - model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2, - batch_size=2000, - device_name="/device:GPU:0") - model(session, - coo_matrix(self.trainX.values), self.trainY.values, - coo_matrix(self.testX.values), self.testY.values) - model.fit(epoch=10000) - - @unittest.skip("Only for benchmarking") - def test_cpu(self): - print('CPU run') - np.random.seed(1) - tf.reset_default_graph() - n, d1 = self.trainX.shape - n, d2 = self.trainY.shape - - with tf.Graph().as_default(), tf.Session() as session: - set_random_seed(0) - model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2, - batch_size=2000) - model(session, - coo_matrix(self.trainX.values), self.trainY.values, - coo_matrix(self.testX.values), self.testY.values) - model.fit(epoch=10000) + model = MMvecALR(self.trainX, self.trainY, latent_dim=2) + mmvec_training_loop(model=model, learning_rate=0.01, batch_size=50, + epochs=500, summary_interval=100) + U_ = np.hstack( + (np.ones((self.U.shape[0], 1)), self.Ubias, self.U)) + V_ = np.vstack( + (self.Vbias, np.ones((1, self.V.shape[1])), self.V)) + + + res = softmax(model.ranks().numpy()) + exp = softmax(np.hstack((np.zeros((d1, 1)), U_ @ V_))) + + s_r, s_p = spearmanr(np.ravel(res), np.ravel(exp)) + + u_r, u_p = spearmanr(pdist(model.U), pdist(self.U)) + v_r, v_p = spearmanr(pdist(model.V.T), pdist(self.V.T)) + + self.assertGreater(u_r, 0.5) + self.assertGreater(v_r, 0.5) + self.assertGreater(s_r, 0.5) + self.assertLess(u_p, 5e-2) + self.assertLess(v_p, 5e-2) + self.assertLess(s_p, 5e-2) + + + assert False +# with tf.Graph().as_default(), tf.Session() as session: +# set_random_seed(0) +# model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2) +# model(session, +# coo_matrix(self.trainX.values), self.trainY.values, +# coo_matrix(self.testX.values), self.testY.values) +# model.fit(epoch=1000) +# +# +# # sanity check cross validation +# self.assertLess(model.cv.eval(), 500) + + +#class TestMMvecSoilsBenchmark(unittest.TestCase): +# def setUp(self): +# self.microbes = load_table(get_data_path('soil_microbes.biom')) +# self.metabolites = load_table(get_data_path('soil_metabolites.biom')) +# X = self.microbes.to_dataframe().T +# Y = self.metabolites.to_dataframe().T +# X = X.loc[Y.index] +# self.trainX = X.iloc[:-2] +# self.trainY = Y.iloc[:-2] +# self.testX = X.iloc[-2:] +# self.testY = Y.iloc[-2:] +# +# def tearDown(self): +# # remove all log directories +# for r in glob.glob("logdir*"): +# shutil.rmtree(r) + +# def test_soils(self): +# np.random.seed(1) +# n, d1 = self.trainX.shape +# n, d2 = self.trainY.shape +# +# with tf.Graph().as_default(), tf.Session() as session: +# set_random_seed(0) +# model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=1, +# learning_rate=1e-3) +# model(session, +# coo_matrix(self.trainX.values), self.trainY.values, +# coo_matrix(self.testX.values), self.testY.values) +# model.fit(epoch=1000) +# +# ranks = pd.DataFrame( +# model.ranks(), +# index=self.microbes.ids(axis='observation'), +# columns=self.metabolites.ids(axis='observation')) +# +# microcoleus_metabolites = [ +# '(3-methyladenine)', '7-methyladenine', '4-guanidinobutanoate', +# 'uracil', 'xanthine', 'hypoxanthine', '(N6-acetyl-lysine)', +# 'cytosine', 'N-acetylornithine', 'N-acetylornithine', +# 'succinate', 'adenosine', 'guanine', 'adenine'] +# mprobs = ranks.loc['rplo 1 (Cyanobacteria)'] +# self.assertEqual(np.sum(mprobs.loc[microcoleus_metabolites] > 0), +# len(microcoleus_metabolites)) +# + +#class TestMMvecBenchmark(unittest.TestCase): +# def setUp(self): +# # build small simulation +# res = random_multimodal( +# num_microbes=100, num_metabolites=1000, num_samples=300, +# latent_dim=2, sigmaQ=2, +# microbe_total=5000, metabolite_total=10000, seed=1 +# ) +# (self.microbes, self.metabolites, self.X, self.B, +# self.U, self.Ubias, self.V, self.Vbias) = res +# num_train = 10 +# self.trainX = self.microbes.iloc[:-num_train] +# self.testX = self.microbes.iloc[-num_train:] +# self.trainY = self.metabolites.iloc[:-num_train] +# self.testY = self.metabolites.iloc[-num_train:] +# +# @unittest.skip("Only for benchmarking") +# def test_gpu(self): +# np.random.seed(1) +# tf.reset_default_graph() +# n, d1 = self.trainX.shape +# n, d2 = self.trainY.shape +# +# with tf.Graph().as_default(), tf.Session() as session: +# set_random_seed(0) +# model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2, +# batch_size=2000, +# device_name="/device:GPU:0") +# model(session, +# coo_matrix(self.trainX.values), self.trainY.values, +# coo_matrix(self.testX.values), self.testY.values) +# model.fit(epoch=10000) + + #@unittest.skip("Only for benchmarking") + #def test_cpu(self): + # print('CPU run') + # np.random.seed(1) + # tf.reset_default_graph() + # n, d1 = self.trainX.shape + # n, d2 = self.trainY.shape + + # with tf.Graph().as_default(), tf.Session() as session: + # set_random_seed(0) + # model = MMvec(beta_1=0.8, beta_2=0.9, latent_dim=2, + # batch_size=2000) + # model(session, + # coo_matrix(self.trainX.values), self.trainY.values, + # coo_matrix(self.testX.values), self.testY.values) + # model.fit(epoch=10000) if __name__ == "__main__": diff --git a/mmvec/train.py b/mmvec/train.py new file mode 100644 index 0000000..f6b7123 --- /dev/null +++ b/mmvec/train.py @@ -0,0 +1,38 @@ +import torch + +def mmvec_training_loop(model, learning_rate, batch_size, epochs, + summary_interval): + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, + betas=(0.8, 0.9), maximize=True) + for epoch in range(epochs): + batch_iterations = int(model.nnz / batch_size) + + for batch in range(batch_iterations): + #iteration = epoch*batch_iterations + batch + 1 + iteration = epoch * model.nnz // batch_size + + draws = torch.multinomial( + model.microbe_relative_freq(model.microbes_train), + batch_size, + replacement=True).T + + loss = model(draws, model.metabolites_train) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + + + + + + if epoch % summary_interval == 0: + + with torch.no_grad(): + cv_draw = torch.multinomial( + model.microbe_relative_freq(model.microbes_test), + batch_size, + replacement=True).T + cv_loss = model(cv_draw, model.metabolites_test) + yield (str(iteration), loss.item(), cv_loss.item()) diff --git a/scripts/mmvec b/scripts/mmvec index 4976c99..0fd8822 100644 --- a/scripts/mmvec +++ b/scripts/mmvec @@ -14,7 +14,6 @@ from skbio.stats.composition import clr_inv as softmax from scipy.stats import entropy, spearmanr from scipy.sparse import coo_matrix from scipy.sparse.linalg import svds -import tensorflow as tf from tensorflow.contrib.distributions import Multinomial, Normal from mmvec.multimodal import MMvec from mmvec.util import split_tables, format_params @@ -157,76 +156,76 @@ def paired_omics(microbe_file, metabolite_file, else: device_name='/cpu:0' - config = tf.ConfigProto() - with tf.Graph().as_default(), tf.Session(config=config) as session: - model = MMvec( - latent_dim=latent_dim, - u_scale=input_prior, v_scale=output_prior, - learning_rate = learning_rate, - beta_1=beta1, beta_2=beta2, - device_name=device_name, - batch_size=batch_size, - clipnorm=clipnorm, save_path=sname) - - model(session, - train_microbes_coo, train_metabolites_df.values, - test_microbes_coo, test_metabolites_df.values) - - loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval, - checkpoint_interval=checkpoint_interval) - - pc_ids = list(range(latent_dim)) - vdim = model.V.shape[0] - V = np.hstack((np.zeros((vdim, 1)), model.V)) - V = V.T - Vbias = np.hstack((np.zeros(1), model.Vbias.ravel())) - - # Save to an embeddings file - Uparam = format_params(model.U, pc_ids, list(train_microbes_df.columns), 'microbe') - Vparam = format_params(V, pc_ids, list(train_metabolites_df.columns), 'metabolite') - df = pd.concat( - ( - Uparam, Vparam, - format_params(model.Ubias, ['bias'], train_microbes_df.columns, 'microbe'), - format_params(Vbias, ['bias'], train_metabolites_df.columns, 'metabolite') - ), axis=0) - - df.to_csv(embeddings_file, sep='\t') - - # Save to a ranks file - ranks = pd.DataFrame(model.ranks(), index=train_microbes_df.columns, - columns=train_metabolites_df.columns) - - u, s, v = svds(ranks - ranks.mean(axis=0), k=latent_dim) - ranks = ranks.T - ranks.index.name = 'featureid' - ranks.to_csv(ranks_file, sep='\t') - # Save to an ordination file - s = s[::-1] - u = u[:, ::-1] - v = v[::-1, :] - if equalize_biplot: - microbe_embed = u @ np.sqrt(np.diag(s)) - metabolite_embed = v.T @ np.sqrt(np.diag(s)) - else: - microbe_embed = u @ np.diag(s) - metabolite_embed = v.T - pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])] - features = pd.DataFrame( - microbe_embed, columns=pc_ids, - index=train_microbes_df.columns) - samples = pd.DataFrame( - metabolite_embed, columns=pc_ids, - index=train_metabolites_df.columns) - short_method_name = 'mmvec biplot' - long_method_name = 'Multiomics mmvec biplot' - eigvals = pd.Series(s, index=pc_ids) - proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids) - biplot = OrdinationResults( - short_method_name, long_method_name, eigvals, - samples=samples, features=features, - proportion_explained=proportion_explained) - biplot.write(ordination_file) + #config = tf.ConfigProto() + #with tf.Graph().as_default(), tf.Session(config=config) as session: + model = MMvec( + latent_dim=latent_dim, + u_scale=input_prior, v_scale=output_prior, + learning_rate = learning_rate, + beta_1=beta1, beta_2=beta2, + device_name=device_name, + batch_size=batch_size, + clipnorm=clipnorm, save_path=sname) + + model(session, + train_microbes_coo, train_metabolites_df.values, + test_microbes_coo, test_metabolites_df.values) + + loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval, + checkpoint_interval=checkpoint_interval) + + pc_ids = list(range(latent_dim)) + vdim = model.V.shape[0] + V = np.hstack((np.zeros((vdim, 1)), model.V)) + V = V.T + Vbias = np.hstack((np.zeros(1), model.Vbias.ravel())) + + # Save to an embeddings file + Uparam = format_params(model.U, pc_ids, list(train_microbes_df.columns), 'microbe') + Vparam = format_params(V, pc_ids, list(train_metabolites_df.columns), 'metabolite') + df = pd.concat( + ( + Uparam, Vparam, + format_params(model.Ubias, ['bias'], train_microbes_df.columns, 'microbe'), + format_params(Vbias, ['bias'], train_metabolites_df.columns, 'metabolite') + ), axis=0) + + df.to_csv(embeddings_file, sep='\t') + + # Save to a ranks file + ranks = pd.DataFrame(model.ranks(), index=train_microbes_df.columns, + columns=train_metabolites_df.columns) + + u, s, v = svds(ranks - ranks.mean(axis=0), k=latent_dim) + ranks = ranks.T + ranks.index.name = 'featureid' + ranks.to_csv(ranks_file, sep='\t') + # Save to an ordination file + s = s[::-1] + u = u[:, ::-1] + v = v[::-1, :] + if equalize_biplot: + microbe_embed = u @ np.sqrt(np.diag(s)) + metabolite_embed = v.T @ np.sqrt(np.diag(s)) + else: + microbe_embed = u @ np.diag(s) + metabolite_embed = v.T + pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])] + features = pd.DataFrame( + microbe_embed, columns=pc_ids, + index=train_microbes_df.columns) + samples = pd.DataFrame( + metabolite_embed, columns=pc_ids, + index=train_metabolites_df.columns) + short_method_name = 'mmvec biplot' + long_method_name = 'Multiomics mmvec biplot' + eigvals = pd.Series(s, index=pc_ids) + proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids) + biplot = OrdinationResults( + short_method_name, long_method_name, eigvals, + samples=samples, features=features, + proportion_explained=proportion_explained) + biplot.write(ordination_file) if __name__ == '__main__': diff --git a/setup.py b/setup.py index 044f522..d568f26 100644 --- a/setup.py +++ b/setup.py @@ -53,14 +53,14 @@ scripts=glob('scripts/mmvec'), install_requires=[ 'biom-format', - 'numpy >= 1.9.2', - 'pandas <= 0.25.3', - 'scipy >= 0.15.1', - 'nose >= 1.3.7', - 'scikit-bio >= 0.5.1', + 'numpy', + 'pandas', + 'scipy', + 'nose', + 'scikit-bio', 'seaborn', 'tqdm', - 'tensorflow>=1.15,<2' + 'torch>=1.9.0' ], classifiers=classifiers, entry_points={