# utils.py
import pandas as pd
import math
import networkx as nx
import itertools as it


# Computes the expansion and decay measures for each day, from the day denoted by
# UNIX timestamp start plus interval days through the end day denoted by UNIX timestamp end,
# and saves the results to a .csv file named in the form
# expansion_decay_1661385600_to_1661817600_i=3.csv, where the two timestamps denote the first
# computed day and the end day and i=3 denotes the interval value.
# The method expects the working directory to contain one file of addresses per day, named in
# the form innerCore_025_1661126400.csv, where innerCore_025_ is an identifying prefix and
# 1661126400 is the UNIX timestamp of the day; each file contains a column of nodes.
#
# @param interval Refers to the variable i in the definition of the expansion and decay measures
# @param start The UNIX timestamp of the first day
# @param end The UNIX timestamp of the last day (inclusive)
# @param filePrefix The prefix of each file to be parsed containing the nodes of the day
# @param colName Name of the nodes column in each file
def computeExpansionDecay(interval, start, end, filePrefix, colName):
    innerCore = {}
    currDay = start
    # populate the dictionary with the set of addresses present on each day
    while currDay != end + 86400:
        df = pd.read_csv(filePrefix + str(currDay) + ".csv")
        addresses = df[colName].to_numpy()
        innerCore[currDay] = set()
        for addr in addresses:
            innerCore[currDay].add(addr)
        currDay += 86400
    currDay = start + (interval * 86400)  # t
    intervalStart = start
    timestamp = []
    expansionMeasure = []
    decayMeasure = []
    while currDay != end + 86400:
        # union of the inner cores of the previous `interval` days
        unionSet = set()
        for i in range(int(intervalStart / 86400), int(currDay / 86400)):
            unionSet = unionSet.union(innerCore[i * 86400])
        expand = innerCore[currDay].difference(unionSet)  # addresses new on day t
        decay = unionSet.difference(innerCore[currDay])   # addresses that dropped out by day t
        timestamp.append(currDay)
        expansionMeasure.append(len(expand))
        decayMeasure.append(len(decay))
        currDay += 86400
        intervalStart += 86400
    results = {"timestamp": timestamp, "expansion": expansionMeasure, "decay": decayMeasure}
    df = pd.DataFrame(results)
    df.to_csv("expansion_decay_" + str(timestamp[0]) + "_to_" + str(end) + "_i=" + str(
        interval) + ".csv")
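

# Example usage of computeExpansionDecay (a sketch only; the column name "node" is hypothetical,
# and the call assumes daily files such as innerCore_025_1661385600.csv exist in the working
# directory):
#
#     computeExpansionDecay(interval=3,
#                           start=1661385600,
#                           end=1661817600,
#                           filePrefix="innerCore_025_",
#                           colName="node")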


# Computes the NF-IAF score of each address on each day from UNIX timestamp start to end
# (inclusive) for each motif type, and saves the result in descending score order to one .csv
# file per motif type, named in the form nfiaf_1648857600_to_1667088000_motif4.csv, where
# 1648857600_to_1667088000 denotes the start-to-end range and motif4 denotes the motif type the
# scores belong to. The score combines a normalized frequency (the address's share of the day's
# total motif occurrences) with an inverse activity frequency
# (log10 of the total number of days divided by the number of days the address is active).
# The method expects the working directory to contain the motif counts of each day as a .csv
# file named in the form 1648857600_motif3.csv, where 1648857600 denotes the UNIX timestamp of
# the day and motif3 denotes the motif type, with a column of addresses and a column of
# occurrence counts for that motif type on that day.
#
# @param start The UNIX timestamp of the first day
# @param end The UNIX timestamp of the last day (inclusive)
# @param motifs A list of motif types that serve as the suffix of each .csv file to be parsed
#               containing the motif counts
# @param filePrefix The prefix of each file to be parsed containing the motif counts of the day
# @param addrColName Name of the address column in each file
# @param occurColName Name of the column containing the motif counts in each file
def computeNFIAF(start, end, motifs, filePrefix, addrColName, occurColName):
    motifAddresses = {}
    # fill each motif set with the unique motif center addresses from all days of that motif type
    for motif in motifs:
        motifAddresses[motif] = set()
        currDay = start
        while currDay != end + 86400:
            df = pd.read_csv(filePrefix + str(currDay) + "_" + motif + ".csv")
            addresses = df[addrColName].to_numpy()
            for addr in addresses:
                motifAddresses[motif].add(addr)
            currDay += 86400
    for motif in motifs:
        addressCol = []
        timestampCol = []
        nfiafCol = []
        # for every unique center address
        for addr in motifAddresses[motif]:
            currDay = start
            occurrences = 0
            # compute the iaf of the address: log10(total days / days the address is active)
            while currDay != end + 86400:
                df = pd.read_csv(filePrefix + str(currDay) + "_" + motif + ".csv")
                if addr in df[addrColName].values:
                    occurrences += 1
                currDay += 86400
            iaf = math.log10(((end - start) / 86400 + 1) / occurrences)
            currDay = start
            # compute the nfiaf of the address for each day
            while currDay != end + 86400:
                df = pd.read_csv(filePrefix + str(currDay) + "_" + motif + ".csv")
                addresses = df[addrColName].to_numpy()
                occur = df[occurColName].to_numpy()
                # reset the per-day counters so nf reflects only this day's motif counts
                totalAddr = 0
                occurrences = 0
                for i in range(0, len(addresses)):
                    totalAddr += occur[i]
                    if addresses[i] == addr:
                        occurrences = occur[i]
                addressCol.append(addr)
                timestampCol.append(currDay)
                nfiafCol.append(occurrences / totalAddr * iaf)
                currDay += 86400
        csv_dict = {'address': addressCol, 'timestamp': timestampCol, 'nfiaf': nfiafCol}
        df = pd.DataFrame(csv_dict)
        df = df.sort_values(by=['nfiaf'], ascending=False)
        df = df.reset_index(drop=True)
        df.to_csv("nfiaf_" + str(start) + "_to_" + str(end) + "_" + motif + ".csv")
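

# Example usage of computeNFIAF (a sketch only; the column names "address" and "occurrences" are
# hypothetical, and the call assumes daily count files such as 1648857600_motif3.csv exist in the
# working directory, so filePrefix is left empty):
#
#     computeNFIAF(start=1648857600,
#                  end=1667088000,
#                  motifs=["motif3", "motif4"],
#                  filePrefix="",
#                  addrColName="address",
#                  occurColName="occurrences")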


# Counts the number of times each node occurs as the center of a three-node motif in a given
# NetworkX graph.
#
# @param graph The directed NetworkX graph on which to perform the center triad census
# @return A dictionary of dictionaries containing the number of occurrences a node is a center,
# where the first key is the motif type and the second key is the center node
def centerTriadCensus(graph):
    # the triads that contain centers
    motifs = {
        'S1': nx.DiGraph([(1, 2), (1, 3)]),
        'S4': nx.DiGraph([(2, 1), (3, 1)]),
        'S5': nx.DiGraph([(1, 2), (2, 3), (1, 3)]),
        'S6': nx.DiGraph([(1, 3), (2, 3), (2, 1), (3, 1)]),
        'S11': nx.DiGraph([(1, 2), (1, 3), (3, 1), (3, 2)]),
    }
    # track occurrences for each node that occurs as a center
    node_center_counts = {
        'motif1': {},
        'motif6': {},
        'motif5buy': {},
        'motif5sell': {},
        'motif4': {},
        'motif11': {},
    }

    def _increment(motifKey, center):
        # record one more occurrence of center as the center of the given motif type
        node_center_counts[motifKey][center] = node_center_counts[motifKey].get(center, 0) + 1

    # undirected view so that neighbors over both incoming and outgoing edges are considered
    undir_graph = graph.to_undirected()
    for node in graph:
        neighbors = set(undir_graph.neighbors(node))
        if len(neighbors) >= 2:
            doublets = list(it.combinations(neighbors, 2))
            for doub in doublets:
                triplet = list(doub)
                triplet.append(node)  # append the node itself to form a triplet
                # check the subgraph induced by the triplet against the triads that contain a center
                subgraph = graph.subgraph(triplet)
                for key, value in motifs.items():
                    if nx.is_isomorphic(subgraph, value):
                        # iterate with a separate variable so the outer loop's `node` is not overwritten
                        if key == 'S1':
                            for center in subgraph:
                                if subgraph.out_degree(center) == 2:
                                    _increment('motif1', center)
                        elif key == 'S6':
                            for center in subgraph:
                                if subgraph.out_degree(center) == 2:
                                    _increment('motif6', center)
                        elif key == 'S5':
                            for center in subgraph:
                                if subgraph.out_degree(center) == 2:
                                    _increment('motif5sell', center)
                                elif subgraph.in_degree(center) == 2:
                                    _increment('motif5buy', center)
                        elif key == 'S4':
                            for center in subgraph:
                                if subgraph.in_degree(center) == 2:
                                    _increment('motif4', center)
                        elif key == 'S11':
                            for center in subgraph:
                                if subgraph.in_degree(center) == 2:
                                    _increment('motif11', center)
    return node_center_counts
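

# Minimal runnable sketch of how centerTriadCensus might be exercised; the toy graph below is
# hypothetical and only meant to show the shape of the returned dictionary.
if __name__ == "__main__":
    # a -> b and a -> c form an S1 triad with a as the center (counted under 'motif1');
    # the edges into c produce S4 triads with c as the center (counted under 'motif4')
    demo_graph = nx.DiGraph([("a", "b"), ("a", "c"), ("d", "c"), ("e", "c")])
    counts = centerTriadCensus(demo_graph)
    for motif_type, centers in counts.items():
        print(motif_type, centers)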