# combine-nmf.py
# Forked from derekgreene/topic-ensemble.
# (GitHub page chrome and the rendered line-number gutter were removed from
# this extraction; the actual script follows below.)
#!/usr/bin/env python
"""
Tool to combine a collection of base topic models, generated by NMF, to produce a single ensemble topic model.
Sample usage:
python combine-nmf.py sample.pkl models/base/*factors*.pkl -k 4 -o models/ensemble
"""
import os, sys, random, operator
import logging as log
from optparse import OptionParser
import numpy as np
import sklearn.preprocessing
import unsupervised.nmf, unsupervised.rankings, unsupervised.util
import text.util
# --------------------------------------------------------------
def main():
    """Combine a collection of base NMF topic models into one ensemble model.

    Command-line driven: expects a cached corpus file followed by two or more
    base factor files (see the module docstring for sample usage). Stacks the
    base H (topic-term) factors, re-factorizes the stacked matrix with NMF to
    produce k ensemble topics, then folds the documents back in. Writes term
    rankings, the ensemble factorization, the final document factors, and a
    disjoint document partition to the output directory.
    """
    parser = OptionParser(usage="usage: %prog [options] corpus_file base_factors1 base_factors2...")
    parser.add_option("--seed", action="store", type="int", dest="seed", help="initial random seed", default=1000)
    # FIX: -k was declared type="string" and converted with int() later on;
    # declaring it as an int lets optparse validate the value up front and
    # report a clean usage error for non-numeric input.
    parser.add_option("-k", action="store", type="int", dest="k", help="number of topics", default=10)
    parser.add_option("--maxiters", action="store", type="int", dest="maxiter", help="maximum number of iterations", default=500)
    parser.add_option("-o", "--outdir", action="store", type="string", dest="dir_out", help="output directory (default is current directory)", default=None)
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose", help="display topic descriptors")
    (options, args) = parser.parse_args()
    if len(args) < 3:
        parser.error( "Must specify corpus file and at least two base factor files" )
    # FIX: use the named logging constant rather than the magic number 20
    log.basicConfig(level=log.INFO, format='%(message)s')
    # Number of ensemble topics K (validated by optparse as an int)
    k = options.k
    # Output directory for results; created on demand
    if options.dir_out is None:
        dir_out = os.getcwd()
    else:
        dir_out = options.dir_out
        if not os.path.exists(dir_out):
            os.makedirs(dir_out)
    # Set random state; a negative seed requests a fresh random one
    random_seed = options.seed
    if random_seed < 0:
        random_seed = random.randint(1, 100000)
    np.random.seed( random_seed )
    random.seed( random_seed )
    log.info("Using random seed %s" % random_seed )
    # Load the cached corpus
    log.info( "Loading data from %s ..." % args[0] )
    (X, all_terms, all_doc_ids, classes) = text.util.load_corpus( args[0] )
    log.info( "Read corpus with %d documents, %d terms" % ( len(all_doc_ids), len(all_terms) ) )
    # Process each specified base topic model, collecting their H factors
    log.info("Processing %d base topic models ..." % len(args[1:]) )
    factors = []
    for base_idx, base_model_path in enumerate(args[1:]):
        # load the cached base topics; only the topic-term factor H is needed
        # for the ensemble (W, doc ids, and terms are unused here)
        (base_W, base_H, base_doc_ids, base_terms) = unsupervised.util.load_nmf_factors( base_model_path )
        base_k = base_W.shape[1]
        log.debug("Base model %d: Read %d base topics from %s" % (base_idx + 1, base_k, base_model_path) )
        # add it to the ensemble collection
        # NOTE: removed dead code that built unused per-topic labels and an
        # unused base_name from the file path
        factors.append( base_H )
    # Merge the H factors to create the stacked topic-term matrix
    M = np.vstack( factors )
    log.info( "Created topic-term matrix of size %dx%d" % M.shape )
    log.info( "Matrix statistics: range=[%.2f,%.2f] mean=%.2f" % ( np.min(M), np.max(M), np.mean(M) ) )
    # Apply NMF to the stacked matrix to derive the k ensemble topics
    impl = unsupervised.nmf.SklNMF( max_iters = options.maxiter, init_strategy = "nndsvd" )
    log.info( "Applying ensemble combination to topic-term matrix for k=%d topics ..." % k )
    impl.apply( M, k )
    ensemble_H = np.array( impl.H )
    ensemble_W = np.array( impl.W )
    log.info( "Generated %dx%d factor W and %dx%d factor H" % ( ensemble_W.shape[0], ensemble_W.shape[1], ensemble_H.shape[0], ensemble_H.shape[1] ) )
    # Create term rankings for each ensemble topic
    term_rankings = []
    for topic_index in range(k):
        ranked_term_indices = impl.rank_terms( topic_index )
        term_ranking = [all_terms[i] for i in ranked_term_indices]
        term_rankings.append(term_ranking)
    # Print out the top terms?
    if options.verbose:
        log.info( unsupervised.rankings.format_term_rankings( term_rankings, top = 10 ) )
    # Write term rankings
    ranks_out_path = os.path.join( dir_out, "ranks_ensemble_k%02d.pkl" % k )
    log.info( "Writing ensemble term ranking set to %s" % ranks_out_path )
    unsupervised.util.save_term_rankings( ranks_out_path, term_rankings )
    # Write the complete factorization
    factor_out_path = os.path.join( dir_out, "factors_ensemble_k%02d.pkl" % k )
    log.info( "Writing complete ensemble factorization to %s" % factor_out_path )
    unsupervised.util.save_nmf_factors( factor_out_path, ensemble_W, ensemble_H, all_doc_ids, all_terms )
    # Fold the documents back in: project the corpus onto the (L2-normalized)
    # ensemble topics to get a document-topic factor D
    HT = sklearn.preprocessing.normalize( ensemble_H.T, "l2", axis=0 )
    D = X.dot(HT)
    # Create a disjoint partition by assigning each document to its max topic
    doc_partition = np.argmax( D, axis = 1 ).flatten().tolist()
    # Write the final document-level factorization
    doc_factor_out_path = os.path.join( dir_out, "factors_final_k%02d.pkl" % k )
    log.info( "Writing ensemble factorization to %s" % doc_factor_out_path )
    unsupervised.util.save_nmf_factors( doc_factor_out_path, D, ensemble_H, all_doc_ids, all_terms )
    # Write document partition
    doc_partition_out_path = os.path.join( dir_out, "partition_final_k%02d.pkl" % k )
    log.info( "Writing ensemble document partition to %s" % doc_partition_out_path )
    unsupervised.util.save_partition( doc_partition_out_path, doc_partition, all_doc_ids )
# --------------------------------------------------------------
# Standard script entry point: run the ensemble pipeline when invoked directly
if __name__ == "__main__":
	main()