-
Notifications
You must be signed in to change notification settings - Fork 7
/
gecmi.cpp
197 lines (176 loc) · 7.89 KB
/
gecmi.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#include <fstream>
#include <stdexcept>
#include <system_error>
#include <boost/program_options.hpp>
#include <boost/numeric/ublas/io.hpp>
#include "cluster_reader.hpp"
#include "bimap_cluster_populator.hpp"
#include "calculate_till_tolerance.hpp"
using std::string;
using std::vector;
using std::cout;
using std::ifstream;
using std::to_string;
using std::invalid_argument;
using std::domain_error;
using std::system_error;
namespace po = boost::program_options;
using namespace gecmi;
int main(int argc, char* argv[])
{
string descrstr = string("Generalized Conventional Mutual Information (GenConvMI)\n"
"Evaluates NMI (normalized by max and sqrt) for overlapping (soft, fuzzy) clusters (communities)"
", compatible with standard NMI\n"
"https://github.com/eXascaleInfolab/GenConvNMI"
"\n\nUsage:\t").append(argv[0]).append(" [options] <clusters1> <clusters2>\n"
"clusters - clusters file in the CNL format (https://github.com/eXascaleInfolab/PyCABeM/blob/master/formats/format.cnl),"
" where each line lists space separated ids of the cluster members\n"
"\nOptions");
po::options_description desc(descrstr);
po::positional_options_description p;
p.add( "input", 2 );
desc.add_options()
("help,h", "produce help message")
("input",
po::value<vector<string>>()->composing(),
"name of the input files" )
("sync,s",
po::value<string>(),
"synchronize the node base omitting the non-matching nodes\n"
"NOTE: The node base is either the first input file or '-' (automatic selection"
" of the input file having the least number of nodes)")
("id-remap,i", "remap ids allowing arbitrary input ids (non-contiguous ranges)"
", otherwise ids should form a solid range and start from 0 or 1")
("nmis,n", "output both NMI [max] and NMI_sqrt")
("fnmi,f", "evaluate also FNMI, includes '-n'")
("risk,r",
po::value<double>()->default_value(0.01),
"probability of value being outside" )
("error,e",
po::value<double>()->default_value(0.01),
"admissible error" )
("fast,a", "apply fast approximate evaluations that are less accurate"
", but much faster on large networks")
("membership,m",
po::value<float>()->default_value(1.f),
"average expected membership of nodes in the clusters, > 0, typically >= 1")
("retain-dups,d", "retain duplicated clusters if any instead of filtering them out"
" (not recommended)")
;
po::variables_map vm;
po::store( po::command_line_parser(argc, argv)
.options(desc)
.positional(p).run(), vm
);
po::notify(vm);
if ( vm.count("help" ) )
{
cout << desc << std::endl;
return 1;
}
vector< string > positionals;
// Whether the node base is explicitly specified as the first input file (not "-")
const bool ndbase1 = vm.count("sync") && vm["sync"].as<string>().compare("-");
try {
// Consider that the first input file can be a sync node base
if(ndbase1)
positionals.push_back(vm["sync"].as<string>());
for(auto& finp: vm["input"].as<vector<string>>())
positionals.push_back(finp);
} catch ( boost::bad_any_cast const& ) {
fprintf(stderr, "Please, provide two input files to proceed. Use `gecmi -h` for more info\n");
throw;
}
if ( positionals.size() != 2 )
throw invalid_argument("Please provide exactly two input files as input\n");
ifstream in1(positionals[0].c_str());
if( !in1 )
throw system_error(errno, std::system_category(), "Could not open the first file\n");
ifstream in2(positionals[1].c_str());
if( !in2 )
throw system_error(errno, std::system_category(), "Could not open the second file\n");
// Read the clusters
two_relations_t two_rel;
bimap_cluster_populator bcp1( two_rel.first );
bimap_cluster_populator bcp2( two_rel.second );
// bimap_cluster_populator: left: Nodes, right: Clusters
size_t b1lnum=0, b2lnum=0; // The number of nodes in the collections
const float membership = vm["membership"].as<float>();
if(membership <= 0)
throw invalid_argument("membership = " + to_string(membership)
+ " should be positive");
{
const bool remap = vm.count("id-remap"); // Remap ids
const bool fltdups = !vm.count("retain-dups"); // Filter out duplicated clusters
IdMap idmap; // Mapping of ids to provide solid range starting from 0 if required
size_t nmods1 = 0; // The number of UNIQUE clusters (modules) in the first collection
size_t nmods2 = 0;
#ifdef DEBUG
fprintf(stderr, "Loading %s...\n", positionals[0].c_str());
#endif // DEBUG
b1lnum = read_clusters(
in1,
bcp1,
positionals[0].c_str(),
remap ? &idmap : nullptr,
membership, fltdups, &nmods1
);
#ifdef DEBUG
fprintf(stderr, "Loading %s...\n", positionals[1].c_str());
#endif // DEBUG
b2lnum = read_clusters(
in2,
bcp2,
positionals[1].c_str(),
remap ? &idmap : nullptr,
membership, fltdups, &nmods2
);
// Consider the case of single cluster collections, where NMI is not applicable
if(nmods1 != nmods2 && (nmods1 == 1 || nmods2 == 1))
throw domain_error("ERROR, NMI is not applicable for the single cluster collections\n");
}
#ifdef DEBUG
assert(b1lnum == bcp1.uniqlSize() && b2lnum == bcp2.uniqlSize()
&& "");
#endif // DEBUG
if(b1lnum != b2lnum) {
const bool sync = vm.count("sync");
fprintf(stderr, "WARNING, evaluating collections have different number of nodes: %lu != %lu"
", sync enabled: %s (forced to finp1: %s)\n", b1lnum, b2lnum
, sync ? "yes" : "no (the quality will be penalized)", ndbase1 ? "yes" : "no");
// Synchronize the number of nodes in both collections if required
if (sync) {
// Sync to bcp1 if the sync is automatic ("-") and bcp1 has the lowest number of nodes
// or if the sync is forced to bcp1 (not "-")
if(b1lnum <= b2lnum || ndbase1) { // bcp1 is the base for the sync, the nodes are removed from bcp2
bcp2.sync(bcp1);
b2lnum = bcp2.uniqlSize();
} else {
bcp1.sync(bcp2); // bcp2 is the base for the sync, the nodes are removed from bcp1
b1lnum = bcp1.uniqlSize();
}
// Show WARNING if the synchronization is failed or
if(b1lnum != b2lnum) {
//throw domain_error("Input collections have different node base and can't be synchronized gracefully: "
// + to_string(b1lnum) + " != " + to_string(b2lnum)+ "\n");
fprintf(stderr, "WARNING, full synchronization failed, the nodes in the collections differ"
" after the partial synchronization: %lu != %lu\n", b1lnum, b2lnum);
}
}
}
const double risk = vm["risk" ].as<double>();
const double epvar = vm["error"].as<double>();
calculated_info_t cit = calculate_till_tolerance( two_rel, risk, epvar
, vm.count("fast"), b1lnum, b2lnum );
if (vm.count("fnmi")) {
const auto b1rnum = bcp1.uniqrSize();
const auto b2rnum = bcp2.uniqrSize();
printf("NMI_max: %G, FNMI: %G, NMI_sqrt: %G; cls1: %lu, cls2: %lu\n", cit.nmi
// Note: 2^x is used instead of e^x to have the same base as in the log
, cit.nmi * pow(2, -abs(double(b1rnum - b2rnum)) / std::max(b1rnum, b2rnum))
, cit.nmi_sqrt, b1rnum, b2rnum);
} else if (vm.count("nmis"))
printf("NMI_max: %G, NMI_sqrt: %G\n", cit.nmi, cit.nmi_sqrt);
else printf("%G\n", cit.nmi);
return 0;
}