diff --git a/deeptools/heatmapper.py b/deeptools/heatmapper.py
index 5badcac0d..6a95ba005 100644
--- a/deeptools/heatmapper.py
+++ b/deeptools/heatmapper.py
@@ -991,7 +991,10 @@ def save_matrix_values(self, file_name):
def save_BED(self, file_handle):
boundaries = np.array(self.matrix.group_boundaries)
# Add a header
- file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group\n")
+ file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group")
+ if self.matrix.silhouette is not None:
+ file_handle.write("\tsilhouette")
+ file_handle.write("\n")
for idx, region in enumerate(self.matrix.regions):
# the label id corresponds to the last boundary
# that is smaller than the region index.
@@ -1013,11 +1016,14 @@ def save_BED(self, file_handle):
region[5],
region[4]))
file_handle.write(
- '\t{0}\t{1}\t{2}\t{3}\n'.format(
+ '\t{0}\t{1}\t{2}\t{3}'.format(
len(region[1]),
",".join([str(int(y) - int(x)) for x, y in region[1]]),
",".join([str(int(x) - int(starts[0])) for x, y in region[1]]),
self.matrix.group_labels[label_idx]))
+ if self.matrix.silhouette is not None:
+ file_handle.write("\t{}".format(self.matrix.silhouette[idx]))
+ file_handle.write("\n")
file_handle.close()
@staticmethod
@@ -1053,6 +1059,23 @@ def get_num_individual_matrix_cols(self):
return matrixCols
+def computeSilhouetteScore(d, idx, labels):
+ """
+ Given a square distance matrix with NaN diagonals, compute the silhouette score
+ of a given row (idx). Each row should have an associated label (labels).
+ """
+ keep = ~np.isnan(d[idx, ])
+ foo = np.bincount(labels[keep], weights=d[idx, ][keep])
+ groupSizes = np.bincount(labels[keep])
+ intraIdx = labels[idx]
+ if groupSizes[intraIdx] == 1:
+ return 0
+ intra = foo[labels[idx]] / groupSizes[intraIdx]
+ interMask = np.arange(len(foo))[np.arange(len(foo)) != labels[idx]]
+ inter = np.min(foo[interMask] / groupSizes[interMask])
+ return (inter - intra) / max(inter, intra)
+
+
class _matrix(object):
"""
class to hold heatmapper matrices
@@ -1082,6 +1105,7 @@ def __init__(self, regions, matrix, group_boundaries, sample_boundaries,
self.sample_boundaries = sample_boundaries
self.sort_method = None
self.sort_using = None
+ self.silhouette = None
if group_labels is None:
self.group_labels = ['group {}'.format(x)
@@ -1225,7 +1249,7 @@ def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None):
self.regions = _sorted_regions
self.set_sorting_method(sort_method, sort_using)
- def hmcluster(self, k, method='kmeans', clustering_samples=None):
+ def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None):
matrix = np.asarray(self.matrix)
matrix_to_cluster = matrix
if clustering_samples is not None:
@@ -1295,8 +1319,25 @@ def hmcluster(self, k, method='kmeans', clustering_samples=None):
self.regions = _clustered_regions
self.matrix = np.vstack(_clustered_matrix)
+
return idx
+ def computeSilhouette(self, k):
+ if k > 1:
+ from scipy.spatial.distance import pdist, squareform
+
+ silhouette = np.repeat(0.0, self.group_boundaries[-1])
+ groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1])
+ labels = np.repeat(np.arange(k), groupSizes)
+
+ d = pdist(self.matrix)
+ d2 = squareform(d)
+ np.fill_diagonal(d2, np.nan) # This excludes the diagonal
+ for idx in range(len(labels)):
+ silhouette[idx] = computeSilhouetteScore(d2, idx, labels)
+ sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette)))
+ self.silhouette = silhouette
+
def removeempty(self):
"""
removes matrix rows containing only zeros or nans
diff --git a/deeptools/parserCommon.py b/deeptools/parserCommon.py
index 073953126..c76eb603f 100644
--- a/deeptools/parserCommon.py
+++ b/deeptools/parserCommon.py
@@ -505,6 +505,17 @@ def heatmapperOptionalArgs(mode=['heatmap', 'profile'][0]):
'fail with an error if a cluster has very few members compared to the '
'total number of regions.',
type=int)
+ cluster.add_argument(
+ '--silhouette',
+ help='Compute the silhouette score for regions. This is only'
+ ' applicable if clustering has been performed. The silhouette score'
+ ' is a measure of how similar a region is to other regions in the'
+ ' same cluster as opposed to those in other clusters. It will be reported'
+ ' in the final column of the BED file with regions. The '
+ 'silhouette evaluation can be very slow when you have more'
+ 'than 100 000 regions.',
+ action='store_true'
+ )
optional = parser.add_argument_group('Optional arguments')
diff --git a/deeptools/plotHeatmap.py b/deeptools/plotHeatmap.py
index e73ca377b..8bd6b42e6 100644
--- a/deeptools/plotHeatmap.py
+++ b/deeptools/plotHeatmap.py
@@ -780,14 +780,11 @@ def main(args=None):
if args.sortRegions == 'keep':
args.sortRegions = 'no' # These are the same thing
if args.kmeans is not None:
- hm.matrix.hmcluster(args.kmeans, method='kmeans',
- clustering_samples=args.clusterUsingSamples)
- else:
- if args.hclust is not None:
- print("Performing hierarchical clustering."
- "Please note that it might be very slow for large datasets.\n")
- hm.matrix.hmcluster(args.hclust, method='hierarchical',
- clustering_samples=args.clusterUsingSamples)
+ hm.matrix.hmcluster(args.kmeans, method='kmeans', clustering_samples=args.clusterUsingSamples)
+ elif args.hclust is not None:
+ print("Performing hierarchical clustering."
+ "Please note that it might be very slow for large datasets.\n")
+ hm.matrix.hmcluster(args.hclust, method='hierarchical', clustering_samples=args.clusterUsingSamples)
group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions)
if np.any(group_len_ratio < 5.0 / 1000):
@@ -816,6 +813,12 @@ def main(args=None):
sort_method=args.sortRegions,
sample_list=sortUsingSamples)
+ if args.silhouette:
+ if args.kmeans is not None:
+ hm.matrix.computeSilhouette(args.kmeans)
+ elif args.hclust is not None:
+ hm.matrix.computeSilhouette(args.args.hclust)
+
if args.outFileNameMatrix:
hm.save_matrix_values(args.outFileNameMatrix)
diff --git a/galaxy/wrapper/deepTools_macros.xml b/galaxy/wrapper/deepTools_macros.xml
index 8547c77ef..9b39a300b 100644
--- a/galaxy/wrapper/deepTools_macros.xml
+++ b/galaxy/wrapper/deepTools_macros.xml
@@ -164,6 +164,12 @@
+
@@ -181,6 +187,7 @@
--hclust $advancedOpt.used_multiple_regions.clustering.n_hclust
#end if
#end if
+ $advancedOpt.used_multiple_regions.silhouette
#end if