mixhop_dataset.py

import collections
import os
import pickle
import sys
import numpy
import scipy.sparse
import tensorflow as tf


def concatenate_csr_matrices_by_rows(matrix1, matrix2):
  """Concatenates sparse csr matrices matrix1 above matrix2.

  Adapted from:
  https://stackoverflow.com/questions/6844998/is-there-an-efficient-way-of-concatenating-scipy-sparse-matrices
  """
  new_data = numpy.concatenate((matrix1.data, matrix2.data))
  new_indices = numpy.concatenate((matrix1.indices, matrix2.indices))
  # Shift matrix2's row pointers past matrix1's nonzeros, then drop its
  # leading zero so the two indptr arrays join into one valid row pointer.
  new_ind_ptr = matrix2.indptr + len(matrix1.data)
  new_ind_ptr = new_ind_ptr[1:]
  new_ind_ptr = numpy.concatenate((matrix1.indptr, new_ind_ptr))
  # Pass the shape explicitly; inferring it from the indices would shrink the
  # column count whenever the trailing columns are all-zero.
  return scipy.sparse.csr_matrix(
      (new_data, new_indices, new_ind_ptr),
      shape=(matrix1.shape[0] + matrix2.shape[0], matrix1.shape[1]))
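

# A quick sanity sketch for the helper above: for CSR inputs with matching
# column counts the result should equal scipy.sparse.vstack, while skipping
# vstack's intermediate COO conversion:
#
#   a = scipy.sparse.csr_matrix(numpy.eye(3, dtype='float32'))
#   b = scipy.sparse.csr_matrix(numpy.ones((2, 3), dtype='float32'))
#   c = concatenate_csr_matrices_by_rows(a, b)        # shape (5, 3)
#   assert (c != scipy.sparse.vstack([a, b]).tocsr()).nnz == 0
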
def load_x(filename):
  # The planetoid pickles were written by Python 2; Python 3 needs the latin1
  # encoding to deserialize the numpy arrays embedded in them.
  if sys.version_info > (3, 0):
    return pickle.load(open(filename, 'rb'), encoding='latin1')
  else:
    return numpy.load(filename)
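

# Illustrative calls (paths shown are hypothetical); for planetoid files
# these return scipy.sparse feature matrices:
#
#   allx = load_x('data/ind.cora.allx')  # features of labeled + unlabeled train nodes
#   testx = load_x('data/ind.cora.tx')   # features of test nodes
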
def ReadDataset(dataset_dir, dataset_name):
  """Returns a Dataset object given e.g. ind.pubmed as a dataset_name.

  Args:
    dataset_dir: `data` directory of planetoid datasets.
    dataset_name: One of "ind.citeseer", "ind.cora", or "ind.pubmed".

  Returns:
    Dataset object (defined below).
  """
  base_path = os.path.join(dataset_dir, dataset_name)
  edge_lists = pickle.load(open(base_path + '.graph', 'rb'))
  allx = load_x(base_path + '.allx')
  ally = numpy.array(numpy.load(base_path + '.ally'), dtype='float32')

  # TODO(haija): Support Homophily Datasets [and upload them].
  if False:  # FLAGS.homophily_dataset:
    # TODO(haija): Support Homophily and analysis of delta-op.
    if False:  # FLAGS.analyze_delta_opness:
      # Replace the features with RBF similarities to 50 anchor points placed
      # on a circle of the given radius.
      x_distances = []
      radius = 300
      angles = numpy.arange(0, numpy.pi * 2, numpy.pi * 2 / 50.0)
      for theta in angles:
        gaussian_y = radius * numpy.cos(theta)
        gaussian_x = radius * numpy.sin(theta)
        center = numpy.array([gaussian_x, gaussian_y])
        x_distances.append(
            numpy.expand_dims(((allx - center)**2).sum(axis=1), 1))
      x_sims = numpy.concatenate(x_distances, axis=1)
      x_sims = numpy.exp(-1e-5 * x_sims)
      allx = x_sims
    llallx = scipy.sparse.csr_matrix(allx).tolil()
    # Integer division: `/` returns a float under Python 3 and breaks range().
    num_train = len(ally) // 3
    # TODO: Load from offline.
    test_idx = list(range(num_train * 2, num_train * 3))
  else:
    testx = load_x(base_path + '.tx')
    # Add test examples. The planetoid format stores test-node features
    # separately in .tx and lists their node ids in .test.index: grow allx
    # with zero rows, then write the test features into those rows.
    test_idx = list(
        map(int, open(base_path + '.test.index').read().split('\n')[:-1]))
    num_test_examples = max(test_idx) - min(test_idx) + 1
    sparse_zeros = scipy.sparse.csr_matrix(
        (num_test_examples, allx.shape[1]), dtype='float32')
    allx = concatenate_csr_matrices_by_rows(allx, sparse_zeros)
    llallx = allx.tolil()
    llallx[test_idx] = testx

  test_idx_set = set(test_idx)
  testy = numpy.array(numpy.load(base_path + '.ty'), dtype='float32')
  ally = numpy.concatenate(
      [ally, numpy.zeros((num_test_examples, ally.shape[1]), dtype='float32')],
      0)
  ally[test_idx] = testy

  num_nodes = len(edge_lists)

  # Will be used to construct (sparse) adjacency matrix.
  edge_sets = collections.defaultdict(set)
  for node, neighbors in edge_lists.items():
    edge_sets[node].add(node)  # Add self-connections.
    for n in neighbors:
      edge_sets[node].add(n)
      edge_sets[n].add(node)  # Assume undirected.

  # Now, build the adjacency list with symmetric normalization: entry (i, j)
  # gets weight 1/sqrt(deg(i) * deg(j)), i.e. D^{-1/2} (A + I) D^{-1/2}.
  adj_indices = []
  adj_values = []
  for node, neighbors in edge_sets.items():
    for n in neighbors:
      adj_indices.append((node, n))
      adj_values.append(1 / numpy.sqrt(len(neighbors) * len(edge_sets[n])))
  adj_indices = numpy.array(adj_indices, dtype='int32')
  adj_values = numpy.array(adj_values, dtype='float32')

  return Dataset(
      num_nodes=num_nodes, edge_sets=edge_sets, test_indices=test_idx,
      adj_indices=adj_indices, adj_values=adj_values, allx=llallx, ally=ally)
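

# Illustrative call (assumes the planetoid files live under a local `data/`
# directory):
#
#   dataset = ReadDataset('data', 'ind.cora')
#   dataset.num_nodes    # 2708 for cora
#   dataset.ally.shape   # (num_nodes, num_classes)
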
class Dataset(object):
  """Dataset object giving access to sparse feature & adjacency matrices.

  Access the matrices, as a sparse tensor, using functions:
    Features: sparse_allx_tensor()
    Adjacency: sparse_adj_tensor()

  If you use these tensors in a tensorflow graph, you must supply their
  dependencies. In particular, feed them into your feed dictionary like:
    feed_dict = {}  # and populate it with your own placeholders etc.
    dataset.populate_feed_dict(feed_dict)  # makes adj and allx tensors runnable.
  """

  def __init__(self, allx=None, ally=None, num_nodes=None, test_indices=None,
               edge_sets=None, adj_indices=None, adj_values=None):
    self.allx = allx
    self.ally = ally
    self.num_nodes = num_nodes
    self.edge_sets = edge_sets
    self.adj_indices = adj_indices
    self.adj_values = adj_values
    self.sp_adj_tensor = None
    self.sp_allx_tensor = None
    self.test_indices = test_indices

  def populate_feed_dict(self, feed_dict):
    """Adds the adjacency matrix and allx to placeholders."""
    # Ensure both sparse tensors exist: sparse_allx_tensor() creates
    # x_indices_ph and x_indices, which are fed below.
    self.sparse_allx_tensor()
    self.sparse_adj_tensor()
    feed_dict[self.x_indices_ph] = self.x_indices
    feed_dict[self.indices_ph] = self.adj_indices
    feed_dict[self.values_ph] = self.adj_values

  def sparse_allx_tensor(self):
    if self.sp_allx_tensor is None:
      xrows, xcols = self.allx.nonzero()
      self.x_indices = numpy.concatenate(
          [numpy.expand_dims(xrows, 1), numpy.expand_dims(xcols, 1)], axis=1)
      # TODO(haija): Support Homophily Datasets [and upload them].
      if False:  # FLAGS.homophily_dataset:
        x_values = numpy.array(
            self.allx[xrows, xcols].todense(), dtype='float32')[0]
      else:
        # Planetoid features are binary, so every nonzero entry is 1.
        x_values = tf.ones([len(xrows)], dtype=tf.float32)
      self.x_indices_ph = tf.placeholder(
          tf.int64, [len(xrows), 2], name='x_indices')
      dense_shape = self.allx.shape
      self.sp_allx_tensor = tf.SparseTensor(
          self.x_indices_ph, x_values, dense_shape)
    return self.sp_allx_tensor

  def sparse_adj_tensor(self):
    if self.sp_adj_tensor is None:
      self.indices_ph = tf.placeholder(
          tf.int64, [len(self.adj_indices), 2], name='indices')
      self.values_ph = tf.placeholder(
          tf.float32, [len(self.adj_indices)], name='values')
      dense_shape = [self.num_nodes, self.num_nodes]
      self.sp_adj_tensor = tf.SparseTensor(
          self.indices_ph, self.values_ph, dense_shape)
    return self.sp_adj_tensor

  def get_partition_indices(self, num_train_nodes, num_validate_nodes):
    """Returns (train, validate, test) node index lists."""
    train_indices = list(range(num_train_nodes))
    validate_indices = list(range(
        num_train_nodes, num_train_nodes + num_validate_nodes))
    test_indices = self.test_indices
    return train_indices, validate_indices, test_indices
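

# Minimal end-to-end sketch, assuming TF1-style sessions and that the
# planetoid files (ind.cora.allx, .ally, .graph, .tx, .ty, .test.index) sit
# under a local `data/` directory; paths and split sizes are illustrative.
if __name__ == '__main__':
  dataset = ReadDataset('data', 'ind.cora')
  adj = dataset.sparse_adj_tensor()     # normalized adjacency, (N, N)
  feats = dataset.sparse_allx_tensor()  # binary feature matrix
  feed_dict = {}
  dataset.populate_feed_dict(feed_dict)  # feeds the placeholders above
  # 140 train / 500 validate is the standard planetoid split for cora.
  train_idx, val_idx, test_idx = dataset.get_partition_indices(140, 500)
  with tf.Session() as sess:
    adj_out, feats_out = sess.run([adj, feats], feed_dict=feed_dict)
    print('adjacency nnz:', len(adj_out.values))
    print('feature matrix shape:', feats_out.dense_shape)
    print('train/val/test sizes:', len(train_idx), len(val_idx),
          len(test_idx))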