-
Notifications
You must be signed in to change notification settings - Fork 4
/
CleanGraph.h
302 lines (280 loc) · 10.9 KB
/
CleanGraph.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
#ifndef SPEEDPPR_CLEANGRAPH_H
#define SPEEDPPR_CLEANGRAPH_H
#ifndef DEEP_CLEAN
#define DEEP_CLEAN
#endif
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <limits>
#include <unordered_set>
#include <algorithm>
#include "BasicDefinition.h"
#include "HelperFunctions.h"
class CleanGraph {
VertexIdType numOfVertices = 0;
EdgeSizeType numOfEdges = 0;
public:
void duplicate_edges(const std::string &_input_file,
const std::string &_output_file) {
////////////////////////////////////////////////////////////
std::ifstream inf(_input_file.c_str());
if (!inf.is_open()) {
printf("CleanGraph::duplicate_edges; File not exists.\n");
printf("%s\n", _input_file.c_str());
exit(1);
}
////////////////////////////////////////////////////////////
// status indicator
printf("\nReading Input Graph\n");
std::string line;
/**
* skip the headers, we assume the headers are the comments that
* begins with '#'
*/
while (std::getline(inf, line) && line[0] == '#') {}
if (line.empty() || !isdigit(line[0])) {
printf("Error in CleanGraph::duplicate_edges.Raw File Format Error.\n");
printf("%s\n", line.c_str());
exit(1);
}
////////////////////////////////////////////////////////////////////////////////
// create temporary graph
std::vector<Edge> edges;
numOfEdges = 0;
/**
* read the raw file
*/
size_t num_lines = 0;
// process the first line=
{
VertexIdType fromId, toID;
++num_lines;
size_t end = 0;
fromId = std::stoul(line, &end);
toID = std::stoul(line.substr(end));
edges.emplace_back(fromId, toID);
// duplicate the edge
edges.emplace_back(toID, fromId);
}
// read the edges
for (VertexIdType fromId, toID; inf >> fromId >> toID;) {
edges.emplace_back(fromId, toID);
// duplicate the edge
edges.emplace_back(toID, fromId);
if (++num_lines % 5000000 == 0) { printf("%zu Valid Lines Read.\n", num_lines); }
}
////////////////////////////////////////////////////////////////////////////////
// close the file
inf.close();
/* final count */
printf("%zu Lines Read.\n", num_lines);
printf("Finish Reading.\n");
numOfEdges = edges.size();
printf("There are %d Edges After Duplications.\n", numOfEdges);
// write the file
if (FILE *file = fopen(_output_file.c_str(), "w")) {
for (const auto &edge : edges) {
fprintf(file, "%u\t%u\n", edge.from_id, edge.to_id);
}
printf("%d Edges Written. Duplication Finished.\n", numOfEdges);
printf("\n%s\n", std::string(80, '=').c_str());
fclose(file);
} else {
printf("CleanGraph::duplicate_edges; Output File Can Not Be Openned.\n");
printf("%s\n", _output_file.c_str());
}
}
/**
* Each vertex is then given a label between 0 to (#vertices - 1).
* Currently We don't deal with parallel edges.
* @param _input_file
* @param _is_directed
* @param _output_folder
*/
void clean_graph(const std::string &_input_file,
const std::string &_output_folder) {
////////////////////////////////////////////////////////////
std::ifstream inf(_input_file.c_str());
if (!inf.is_open()) {
printf("CleanGraph::clean_graph; File not exists.\n");
printf("%s\n", _input_file.c_str());
exit(1);
}
////////////////////////////////////////////////////////////
// status indicator
printf("\nReading Input Graph\n");
std::string line;
/**
* skip the headers, we assume the headers are the comments that
* begins with '#'
*/
while (std::getline(inf, line) && line[0] == '#') {}
if (line.empty() || !isdigit(line[0])) {
printf("Error in CleanGraph::clean_graph. Raw File Format Error.\n");
printf("%s\n", line.c_str());
exit(1);
}
////////////////////////////////////////////////////////////////////////////////
// create temporary graph
std::vector<Edge> edges;
numOfEdges = 0;
/**
* read the raw file
*/
size_t num_lines = 0;
// process the first line
{
VertexIdType fromId, toID;
++num_lines;
size_t end = 0;
fromId = std::stoul(line, &end);
toID = std::stoul(line.substr(end));
// remove self-loops
#ifdef DEEP_CLEAN
if (fromId != toID) { edges.emplace_back(fromId, toID); }
#else
edges.emplace_back(fromId, toID);
#endif
}
// read the edges
for (VertexIdType fromId, toID; inf >> fromId >> toID;) {
#ifdef DEEP_CLEAN
if (fromId != toID) { edges.emplace_back(fromId, toID); }
#else
edges.emplace_back(fromId, toID);
#endif
if (++num_lines % 5000000 == 0) { printf("%zu Valid Lines Read.\n", num_lines); }
}
////////////////////////////////////////////////////////////////////////////////
// close the file
inf.close();
/* final count */
printf("%zu Lines Read.\n", num_lines);
numOfEdges = edges.size();
printf("%d-th Non-Self Loop Edges.\n", numOfEdges);
printf("Finish Reading.\n");
printf("\n%s\n", std::string(30, '-').c_str());
// find the maximum id
size_t id_max = 0;
size_t id_min = std::numeric_limits<uint32_t>::max();
for (const auto &pair : edges) {
id_max = std::max(id_max, (size_t) std::max(pair.from_id, pair.to_id));
id_min = std::min(id_min, (size_t) std::min(pair.from_id, pair.to_id));
}
printf("Maximum ID: %zu\n", id_max);
printf("Minimum ID: %zu\n", id_min);
if (id_max >= std::numeric_limits<uint32_t>::max()) {
printf("Warning. Change VertexIdType First.\n");
exit(1);
}
const VertexIdType one_plus_id_max = id_max + 1;
std::vector<VertexIdType> out_degree(one_plus_id_max, 0);
std::vector<VertexIdType> in_degree(one_plus_id_max, 0);
// compute the degrees.
for (const auto &edge : edges) {
++out_degree[edge.from_id];
++in_degree[edge.to_id];
}
// count the number of dead-end vertices
uint32_t original_dead_end_num = 0;
uint32_t num_isolated_points = 0;
uint32_t max_degree = 0;
for (VertexIdType id = 0; id < one_plus_id_max; ++id) {
if (out_degree[id] == 0) {
++original_dead_end_num;
if (in_degree[id] == 0) {
++num_isolated_points;
}
}
// compute maximum out degree
max_degree = std::max(out_degree[id], max_degree);
}
printf("The number of dead end vertices: %u\n", original_dead_end_num);
printf("The number of Isolated Points: %u\n", num_isolated_points);
printf("The maximum out degree is: %u\n", max_degree);
// re-label the vertices.
#ifdef DEEP_CLEAN
std::vector<std::vector<VertexIdType>> out_adj_list(one_plus_id_max, std::vector<VertexIdType>());
for (const auto &edge : edges) {
out_adj_list[edge.from_id].push_back(edge.to_id);
}
numOfVertices = 0;
std::vector<bool> visited(one_plus_id_max, false);
std::vector<VertexIdType> id_map(one_plus_id_max, 0);
for (VertexIdType index = 0; index < one_plus_id_max; ++index) {
// skip singleton and visited vertices
if (!visited[index] && out_degree[index] > 0) {
std::vector<VertexIdType> current_level({index});
visited[index] = true;
std::vector<VertexIdType> next_level;
while (!current_level.empty()) {
for (const VertexIdType ¤t_id : current_level) {
id_map[current_id] = numOfVertices;
++numOfVertices;
for (const VertexIdType &nid : out_adj_list[current_id]) {
if (!visited[nid]) {
next_level.push_back(nid);
visited[nid] = true;
}
}
}
current_level.swap(next_level);
next_level.clear();
}
}
}
printf("Relabeling Finish.\n");
printf("The number of valid vertices: %u\n", numOfVertices);
// re-write the edges ids
for (auto &edge : edges) {
edge.from_id = id_map[edge.from_id];
edge.to_id = id_map[edge.to_id];
}
#else
// we assume the vertice ids are in the arrange of 0 ... numOfVertices - 1
numOfVertices = one_plus_id_max;
#endif
// sort the edges
std::sort(edges.begin(), edges.end());
// Write the attribute file
numOfEdges = edges.size();
std::string attribute_file = _output_folder + '/' + "attribute.txt";
if (std::FILE *file = std::fopen(attribute_file.c_str(), "w")) {
std::fprintf(file, "n=%d\nm=%d\n", numOfVertices, numOfEdges);
std::fclose(file);
} else {
printf("Graph::clean_graph; File Not Exists.\n");
printf("%s\n", attribute_file.c_str());
exit(1);
}
// write the graph in txt
std::string graph_txt_file = _output_folder + '/' + "graph.txt";
if (std::FILE *file = std::fopen(graph_txt_file.c_str(), "w")) {
fprintf(file, "%u\n", numOfVertices);
for (const auto &edge : edges) {
fprintf(file, "%u\t%u\n", edge.from_id, edge.to_id);
}
printf("Writing Txt Finished.\n");
std::fclose(file);
} else {
printf("Graph::clean_graph; File Not Exists.\n");
printf("%s\n", graph_txt_file.c_str());
exit(1);
}
// write the graph in binary
std::string graph_bin_file = _output_folder + '/' + "graph.bin";
if (std::FILE *file = std::fopen(graph_bin_file.c_str(), "wb")) {
std::fwrite(edges.data(), sizeof edges[0], edges.size(), file);
printf("Writing Binary Finished.\n");
std::fclose(file);
} else {
printf("Graph::clean_graph; File Not Exists.\n");
printf("%s\n", graph_bin_file.c_str());
exit(1);
}
printf("%s\n", std::string(110, '=').c_str());
}
};
#endif //SPEEDPPR_CLEANGRAPH_H