# combine-nmf.py
# Forked from derekgreene/topic-ensemble.
# (GitHub page chrome and the rendered line-number gutter were removed from
# this extraction; the actual script follows below.)
#!/usr/bin/env python
"""
Tool to combine a collection of base topic models, generated by NMF, to produce a single ensemble topic model.
Sample usage:
python combine-nmf.py sample.pkl models/base/*factors*.pkl -k 4 -o models/ensemble
"""
import os, sys, random, operator
import logging as log
from optparse import OptionParser
import numpy as np
import sklearn.preprocessing
import unsupervised.nmf, unsupervised.rankings, unsupervised.util
import text.util
# --------------------------------------------------------------
def main():
    """Combine a collection of base NMF topic models into one ensemble model.

    Command-line driven: expects a cached corpus file followed by two or more
    base factor files (see the module docstring for sample usage). Stacks the
    base H (topic-term) factors, re-factorizes the stacked matrix with NMF to
    produce k ensemble topics, then folds the documents back in. Writes term
    rankings, the ensemble factorization, the final document factors, and a
    disjoint document partition to the output directory.
    """
    parser = OptionParser(usage="usage: %prog [options] corpus_file base_factors1 base_factors2...")
    parser.add_option("--seed", action="store", type="int", dest="seed", help="initial random seed", default=1000)
    # FIX: -k was declared type="string" and converted with int() later on;
    # declaring it as an int lets optparse validate the value up front and
    # report a clean usage error for non-numeric input.
    parser.add_option("-k", action="store", type="int", dest="k", help="number of topics", default=10)
    parser.add_option("--maxiters", action="store", type="int", dest="maxiter", help="maximum number of iterations", default=500)
    parser.add_option("-o", "--outdir", action="store", type="string", dest="dir_out", help="output directory (default is current directory)", default=None)
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose", help="display topic descriptors")
    (options, args) = parser.parse_args()
    if len(args) < 3:
        parser.error( "Must specify corpus file and at least two base factor files" )
    # FIX: use the named logging constant rather than the magic number 20
    log.basicConfig(level=log.INFO, format='%(message)s')
    # Number of ensemble topics K (validated by optparse as an int)
    k = options.k
    # Output directory for results; created on demand
    if options.dir_out is None:
        dir_out = os.getcwd()
    else:
        dir_out = options.dir_out
        if not os.path.exists(dir_out):
            os.makedirs(dir_out)
    # Set random state; a negative seed requests a fresh random one
    random_seed = options.seed
    if random_seed < 0:
        random_seed = random.randint(1, 100000)
    np.random.seed( random_seed )
    random.seed( random_seed )
    log.info("Using random seed %s" % random_seed )
    # Load the cached corpus
    log.info( "Loading data from %s ..." % args[0] )
    (X, all_terms, all_doc_ids, classes) = text.util.load_corpus( args[0] )
    log.info( "Read corpus with %d documents, %d terms" % ( len(all_doc_ids), len(all_terms) ) )
    # Process each specified base topic model, collecting their H factors
    log.info("Processing %d base topic models ..." % len(args[1:]) )
    factors = []
    for base_idx, base_model_path in enumerate(args[1:]):
        # load the cached base topics; only the topic-term factor H is needed
        # for the ensemble (W, doc ids, and terms are unused here)
        (base_W, base_H, base_doc_ids, base_terms) = unsupervised.util.load_nmf_factors( base_model_path )
        base_k = base_W.shape[1]
        log.debug("Base model %d: Read %d base topics from %s" % (base_idx + 1, base_k, base_model_path) )
        # add it to the ensemble collection
        # NOTE: removed dead code that built unused per-topic labels and an
        # unused base_name from the file path
        factors.append( base_H )
    # Merge the H factors to create the stacked topic-term matrix
    M = np.vstack( factors )
    log.info( "Created topic-term matrix of size %dx%d" % M.shape )
    log.info( "Matrix statistics: range=[%.2f,%.2f] mean=%.2f" % ( np.min(M), np.max(M), np.mean(M) ) )
    # Apply NMF to the stacked matrix to derive the k ensemble topics
    impl = unsupervised.nmf.SklNMF( max_iters = options.maxiter, init_strategy = "nndsvd" )
    log.info( "Applying ensemble combination to topic-term matrix for k=%d topics ..." % k )
    impl.apply( M, k )
    ensemble_H = np.array( impl.H )
    ensemble_W = np.array( impl.W )
    log.info( "Generated %dx%d factor W and %dx%d factor H" % ( ensemble_W.shape[0], ensemble_W.shape[1], ensemble_H.shape[0], ensemble_H.shape[1] ) )
    # Create term rankings for each ensemble topic
    term_rankings = []
    for topic_index in range(k):
        ranked_term_indices = impl.rank_terms( topic_index )
        term_ranking = [all_terms[i] for i in ranked_term_indices]
        term_rankings.append(term_ranking)
    # Print out the top terms?
    if options.verbose:
        log.info( unsupervised.rankings.format_term_rankings( term_rankings, top = 10 ) )
    # Write term rankings
    ranks_out_path = os.path.join( dir_out, "ranks_ensemble_k%02d.pkl" % k )
    log.info( "Writing ensemble term ranking set to %s" % ranks_out_path )
    unsupervised.util.save_term_rankings( ranks_out_path, term_rankings )
    # Write the complete factorization
    factor_out_path = os.path.join( dir_out, "factors_ensemble_k%02d.pkl" % k )
    log.info( "Writing complete ensemble factorization to %s" % factor_out_path )
    unsupervised.util.save_nmf_factors( factor_out_path, ensemble_W, ensemble_H, all_doc_ids, all_terms )
    # Fold the documents back in: project the corpus onto the (L2-normalized)
    # ensemble topics to get a document-topic factor D
    HT = sklearn.preprocessing.normalize( ensemble_H.T, "l2", axis=0 )
    D = X.dot(HT)
    # Create a disjoint partition by assigning each document to its max topic
    doc_partition = np.argmax( D, axis = 1 ).flatten().tolist()
    # Write the final document-level factorization
    doc_factor_out_path = os.path.join( dir_out, "factors_final_k%02d.pkl" % k )
    log.info( "Writing ensemble factorization to %s" % doc_factor_out_path )
    unsupervised.util.save_nmf_factors( doc_factor_out_path, D, ensemble_H, all_doc_ids, all_terms )
    # Write document partition
    doc_partition_out_path = os.path.join( dir_out, "partition_final_k%02d.pkl" % k )
    log.info( "Writing ensemble document partition to %s" % doc_partition_out_path )
    unsupervised.util.save_partition( doc_partition_out_path, doc_partition, all_doc_ids )
# --------------------------------------------------------------
# Standard script entry point: run the ensemble pipeline when invoked directly
if __name__ == "__main__":
	main()