diff --git a/deeptools/heatmapper.py b/deeptools/heatmapper.py index 5badcac0d..6a95ba005 100644 --- a/deeptools/heatmapper.py +++ b/deeptools/heatmapper.py @@ -991,7 +991,10 @@ def save_matrix_values(self, file_name): def save_BED(self, file_handle): boundaries = np.array(self.matrix.group_boundaries) # Add a header - file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group\n") + file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group") + if self.matrix.silhouette is not None: + file_handle.write("\tsilhouette") + file_handle.write("\n") for idx, region in enumerate(self.matrix.regions): # the label id corresponds to the last boundary # that is smaller than the region index. @@ -1013,11 +1016,14 @@ def save_BED(self, file_handle): region[5], region[4])) file_handle.write( - '\t{0}\t{1}\t{2}\t{3}\n'.format( + '\t{0}\t{1}\t{2}\t{3}'.format( len(region[1]), ",".join([str(int(y) - int(x)) for x, y in region[1]]), ",".join([str(int(x) - int(starts[0])) for x, y in region[1]]), self.matrix.group_labels[label_idx])) + if self.matrix.silhouette is not None: + file_handle.write("\t{}".format(self.matrix.silhouette[idx])) + file_handle.write("\n") file_handle.close() @staticmethod @@ -1053,6 +1059,23 @@ def get_num_individual_matrix_cols(self): return matrixCols +def computeSilhouetteScore(d, idx, labels): + """ + Given a square distance matrix with NaN diagonals, compute the silhouette score + of a given row (idx). Each row should have an associated label (labels). + """ + keep = ~np.isnan(d[idx, ]) + foo = np.bincount(labels[keep], weights=d[idx, ][keep]) + groupSizes = np.bincount(labels[keep]) + intraIdx = labels[idx] + if groupSizes[intraIdx] == 1: + return 0 + intra = foo[labels[idx]] / groupSizes[intraIdx] + interMask = np.arange(len(foo))[np.arange(len(foo)) != labels[idx]] + inter = np.min(foo[interMask] / groupSizes[interMask]) + return (inter - intra) / max(inter, intra) + + class _matrix(object): """ class to hold heatmapper matrices @@ -1082,6 +1105,7 @@ def __init__(self, regions, matrix, group_boundaries, sample_boundaries, self.sample_boundaries = sample_boundaries self.sort_method = None self.sort_using = None + self.silhouette = None if group_labels is None: self.group_labels = ['group {}'.format(x) @@ -1225,7 +1249,7 @@ def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None): self.regions = _sorted_regions self.set_sorting_method(sort_method, sort_using) - def hmcluster(self, k, method='kmeans', clustering_samples=None): + def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None): matrix = np.asarray(self.matrix) matrix_to_cluster = matrix if clustering_samples is not None: @@ -1295,8 +1319,25 @@ def hmcluster(self, k, method='kmeans', clustering_samples=None): self.regions = _clustered_regions self.matrix = np.vstack(_clustered_matrix) + return idx + def computeSilhouette(self, k): + if k > 1: + from scipy.spatial.distance import pdist, squareform + + silhouette = np.repeat(0.0, self.group_boundaries[-1]) + groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1]) + labels = np.repeat(np.arange(k), groupSizes) + + d = pdist(self.matrix) + d2 = squareform(d) + np.fill_diagonal(d2, np.nan) # This excludes the diagonal + for idx in range(len(labels)): + silhouette[idx] = computeSilhouetteScore(d2, idx, labels) + sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette))) + self.silhouette = silhouette + def removeempty(self): """ removes matrix rows containing only zeros or nans diff --git a/deeptools/parserCommon.py b/deeptools/parserCommon.py index 073953126..c76eb603f 100644 --- a/deeptools/parserCommon.py +++ b/deeptools/parserCommon.py @@ -505,6 +505,17 @@ def heatmapperOptionalArgs(mode=['heatmap', 'profile'][0]): 'fail with an error if a cluster has very few members compared to the ' 'total number of regions.', type=int) + cluster.add_argument( + '--silhouette', + help='Compute the silhouette score for regions. This is only' + ' applicable if clustering has been performed. The silhouette score' + ' is a measure of how similar a region is to other regions in the' + ' same cluster as opposed to those in other clusters. It will be reported' + ' in the final column of the BED file with regions. The ' + 'silhouette evaluation can be very slow when you have more' + 'than 100 000 regions.', + action='store_true' + ) optional = parser.add_argument_group('Optional arguments') diff --git a/deeptools/plotHeatmap.py b/deeptools/plotHeatmap.py index e73ca377b..8bd6b42e6 100644 --- a/deeptools/plotHeatmap.py +++ b/deeptools/plotHeatmap.py @@ -780,14 +780,11 @@ def main(args=None): if args.sortRegions == 'keep': args.sortRegions = 'no' # These are the same thing if args.kmeans is not None: - hm.matrix.hmcluster(args.kmeans, method='kmeans', - clustering_samples=args.clusterUsingSamples) - else: - if args.hclust is not None: - print("Performing hierarchical clustering." - "Please note that it might be very slow for large datasets.\n") - hm.matrix.hmcluster(args.hclust, method='hierarchical', - clustering_samples=args.clusterUsingSamples) + hm.matrix.hmcluster(args.kmeans, method='kmeans', clustering_samples=args.clusterUsingSamples) + elif args.hclust is not None: + print("Performing hierarchical clustering." + "Please note that it might be very slow for large datasets.\n") + hm.matrix.hmcluster(args.hclust, method='hierarchical', clustering_samples=args.clusterUsingSamples) group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions) if np.any(group_len_ratio < 5.0 / 1000): @@ -816,6 +813,12 @@ def main(args=None): sort_method=args.sortRegions, sample_list=sortUsingSamples) + if args.silhouette: + if args.kmeans is not None: + hm.matrix.computeSilhouette(args.kmeans) + elif args.hclust is not None: + hm.matrix.computeSilhouette(args.args.hclust) + if args.outFileNameMatrix: hm.save_matrix_values(args.outFileNameMatrix) diff --git a/galaxy/wrapper/deepTools_macros.xml b/galaxy/wrapper/deepTools_macros.xml index 8547c77ef..9b39a300b 100644 --- a/galaxy/wrapper/deepTools_macros.xml +++ b/galaxy/wrapper/deepTools_macros.xml @@ -164,6 +164,12 @@ + @@ -181,6 +187,7 @@ --hclust $advancedOpt.used_multiple_regions.clustering.n_hclust #end if #end if + $advancedOpt.used_multiple_regions.silhouette #end if