From a28a8f6df7a59c06f7030eddcae8e3ab371bbb99 Mon Sep 17 00:00:00 2001
From: Matt Olm
Date: Thu, 27 Feb 2020 08:57:23 -0800
Subject: [PATCH] 2.5.3

---
 CHANGELOG.md      |  5 +++++
 drep/VERSION      |  2 +-
 drep/d_cluster.py | 49 +++++++++++++++++++++++++++--------------------
 3 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5dc7279..2b29109 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project (attempts to) adhere to [Semantic Versioning](http://semver.org/).
 
+## [2.5.3] - 2020-02-27
+### Fixed
+- More bug fixes related to FastANI
+- Allow loading of cached Ndb.csv
+
 ## [2.5.2] - 2020-02-26
 ### Fixed
 - The bug I tried to fix in 2.5.1 is able to fix itself
diff --git a/drep/VERSION b/drep/VERSION
index f225a78..aedc15b 100644
--- a/drep/VERSION
+++ b/drep/VERSION
@@ -1 +1 @@
-2.5.2
+2.5.3
diff --git a/drep/d_cluster.py b/drep/d_cluster.py
index 6edbb04..a57fc81 100644
--- a/drep/d_cluster.py
+++ b/drep/d_cluster.py
@@ -193,33 +193,40 @@ def cluster_genomes(genome_list, data_folder, **kwargs):
     kwargs.get('wd')._wipe_secondary_clusters()
 
     if not kwargs.get('SkipSecondary', False):
+        # See if cached
+        cached = (debug and wd.hasDb('Ndb'))
+
+        if cached:
+            logging.info('3. Loading cached secondary clustering')
+            Ndb = wd.get_db('Ndb')
+
+            # Get rid of broken ones
+            Ndb = Ndb.dropna(subset=['reference'])
+
+            logging.info('3. Secondary clustering cache loaded')
+
         # Run comparisons, make Ndb
-        _print_time_estimate(Bdb, Cdb, algorithm, kwargs.get('processors', 6))
-        Ndb = pd.DataFrame()
-        for bdb, name in iteratre_clusters(Bdb,Cdb):
-            logging.debug('running cluster {0}'.format(name))
-            #logging.debug('total memory - {0:.2f} Mbp'.format(int(process.memory_info().rss)/1000000))
-            try:
+        else:
+            _print_time_estimate(Bdb, Cdb, algorithm, kwargs.get('processors', 6))
+            Ndb = pd.DataFrame()
+            for bdb, name in iteratre_clusters(Bdb,Cdb):
+                logging.debug('running cluster {0}'.format(name))
+                #logging.debug('total memory - {0:.2f} Mbp'.format(int(process.memory_info().rss)/1000000))
                 ndb = compare_genomes(bdb, algorithm, data_folder, **kwargs)
-                ndb['primary_cluster'] = name
-                Ndb = Ndb.append(ndb)
-
-            except:
-                logging.error("CRITICAL ERROR WITH PRIMARY CLUSTER {0}; TRYING AGAIN".format(name))
-                try:
+                if len(ndb) == 0:
+                    logging.error("CRITICAL ERROR WITH PRIMARY CLUSTER {0}; TRYING AGAIN".format(name))
                     ndb = compare_genomes(bdb, algorithm, data_folder, **kwargs)
-                    if len(ndb) > 0:
-                        ndb['primary_cluster'] = name
-                        Ndb = Ndb.append(ndb)
-                    else:
-                        logging.error("CRITICAL ERROR AGAIN WITH PRIMARY CLUSTER {0}; SKIPPING".format(name))
-                except:
+
+                if len(ndb) > 0:
+                    ndb['primary_cluster'] = name
+                    Ndb = Ndb.append(ndb)
+                else:
                     logging.error("DOUBLE CRITICAL ERROR AGAIN WITH PRIMARY CLUSTER {0}; SKIPPING".format(name))
 
-            if debug:
-                logging.debug("Debug mode on - saving Ndb ASAP")
-                wd.store_db(Ndb, 'Ndb')
+                if debug:
+                    logging.debug("Debug mode on - saving Ndb ASAP")
+                    wd.store_db(Ndb, 'Ndb')
 
         # Run clustering on Ndb
         Cdb, c2ret = _cluster_Ndb(Ndb, comp_method=algorithm, **kwargs)
 