SEMScore.py
import numpy as np
from numpy.linalg import inv
import math


class SEMBicScore:

    def __init__(self, penalty_discount,
                 dataset=None,
                 corrs=None, dataset_size=None,
                 cache_interval=1):
        """Initialize the SEMBicScore object.

        Exactly one of `dataset` or the pair (`corrs`, `dataset_size`) must be provided.
        (A minimal usage sketch appears at the bottom of this file.)

        :param penalty_discount: sparsity parameter
        :param dataset: 2D np.array, with a row for each data point and a column for each variable
        :param corrs: 2D square np.array, the correlation coefficients
        :param dataset_size: the number of data points
        :param cache_interval: parameter to limit how many partial correlations are cached
        """
        self.penalty = penalty_discount
        self.cache = {}
        self.cache_interval = cache_interval

        if dataset is not None:
            assert corrs is None and dataset_size is None, \
                "SEMBicScore: must specify either dataset or {corrs, dataset_size}"
            assert len(dataset.shape) == 2
            self.corrcoef = np.corrcoef(dataset.transpose())
            self.sample_size = dataset.shape[0]
        elif corrs is not None and dataset_size is not None:
            assert len(corrs.shape) == 2
            assert corrs.shape[0] == corrs.shape[1]
            self.corrcoef = corrs
            self.sample_size = dataset_size
        else:
            raise AssertionError("SEMBicScore: must specify either dataset or {corrs, dataset_size}")

    # def partial_corr(self, x, y, Z):
    #     """
    #     Returns the partial correlation coefficients between elements of X controlling for the elements in Z.
    #     """
    #     x_data = self.dataset[:,x]
    #     y_data = self.dataset[:,y]
    #     if Z == []:
    #         return self.corrcoef[x, y]
    #     Z_data = self.dataset[:,Z]
    #
    #     beta_i = np.linalg.lstsq(Z_data, x_data)[0]
    #     beta_j = np.linalg.lstsq(Z_data, y_data)[0]
    #
    #     res_j = x_data - Z_data.dot(beta_i)
    #     res_i = y_data - Z_data.dot(beta_j)
    #
    #     return np.corrcoef(res_i, res_j)[0, 1]

    def recursive_partial_corr(self, x, y, Z):
        """Return the partial correlation of variables x and y given the conditioning
        set Z (a frozenset of variable indices), using the first-order recursion

            r(x, y | Z) = (r(x, y | Z1) - r(x, z0 | Z1) * r(y, z0 | Z1))
                          / sqrt((1 - r(x, z0 | Z1)^2) * (1 - r(y, z0 | Z1)^2))

        where z0 = min(Z) and Z1 = Z - {z0}. Intermediate correlations are memoized in
        self.cache whenever the conditioning set size is a nonzero multiple of
        self.cache_interval.
        """
        if len(Z) == 0:
            return self.corrcoef[x, y]
        else:
            z0 = min(Z)
            Z1 = Z - {z0}

            key1 = (frozenset({x, y}), Z1)
            if key1 not in self.cache:
                term1 = self.recursive_partial_corr(x, y, Z1)
                if len(Z1) > 0 and len(Z1) % self.cache_interval == 0:
                    self.cache[key1] = term1
            else:
                term1 = self.cache[key1]

            key2 = (frozenset({x, z0}), Z1)
            if key2 not in self.cache:
                term2 = self.recursive_partial_corr(x, z0, Z1)
                if len(Z1) > 0 and len(Z1) % self.cache_interval == 0:
                    self.cache[key2] = term2
            else:
                term2 = self.cache[key2]

            key3 = (frozenset({y, z0}), Z1)
            if key3 not in self.cache:
                term3 = self.recursive_partial_corr(y, z0, Z1)
                if len(Z1) > 0 and len(Z1) % self.cache_interval == 0:
                    self.cache[key3] = term3
            else:
                term3 = self.cache[key3]

            return (term1 - (term2 * term3)) / math.sqrt((1 - (term2 * term2)) * (1 - (term3 * term3)))

    # def local_score(self, node, parents):
    #     """ `node` is an int index """
    #     #print("Node:", node, "Parents:", parents)
    #     variance = self.cov[node][node]
    #     p = len(parents)
    #
    #     # np.ix_(rowIndices, colIndices)
    #
    #     covxx = self.cov[np.ix_(parents, parents)]
    #     covxx_inv = inv(covxx)
    #
    #     if (p == 0):
    #         covxy = []
    #     else:
    #         # vector
    #         covxy = self.cov[np.ix_(parents, [node])]
    #
    #     b = np.dot(covxx_inv, covxy)
    #
    #     variance -= np.dot(np.transpose(covxy), b)
    #
    #     if variance <= 0:
    #         return None
    #
    #     returnval = self.score(variance, p)
    #     return returnval

    # def local_score_no_parents(self, node):
    #     """ if node has no parents """
    #     variance = self.cov[node][node]
    #
    #     if variance <= 0:
    #         return None
    #
    #     return self.score(variance)

    # def score(self, variance, parents_len=0):
    #     bic = - self.sample_size * math.log(variance) - parents_len * self.penalty * math.log(self.sample_size)
    #     # TODO: Struct prior?
    #     return bic

    def local_score_diff_parents(self, node1, node2, parents):
        """Change in the local BIC score from adding node1 as a parent of node2,
        given node2's existing parents."""
        parents = frozenset(parents)
        r = self.recursive_partial_corr(node1, node2, parents)
        return -self.sample_size * math.log(1.0 - r * r) - self.penalty * math.log(self.sample_size)
        # return self.local_score(node2, parents + [node1]) - self.local_score(node2, parents)

    def local_score_diff(self, node1, node2):
        """Change in the local BIC score from adding node1 as a parent of node2
        when node2 currently has no parents."""
        return self.local_score_diff_parents(node1, node2, [])
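
# A minimal usage sketch (illustrative only, not part of the original module): the score
# can be built either from a raw dataset (rows = samples, columns = variables) or from a
# precomputed correlation matrix plus the sample size, and then queried for the BIC change
# of candidate edges. The data and variable names below are synthetic/hypothetical.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    data = rng.normal(size=(500, 3))   # 500 samples, 3 variables
    data[:, 2] += 0.8 * data[:, 0]     # make variable 2 depend on variable 0

    # Construct from the raw dataset ...
    score = SEMBicScore(penalty_discount=2, dataset=data)

    # ... or, equivalently, from a correlation matrix and the sample size.
    score_from_corrs = SEMBicScore(penalty_discount=2,
                                   corrs=np.corrcoef(data.transpose()),
                                   dataset_size=data.shape[0])

    # BIC change from adding 0 as a parent of 2, unconditionally and conditioned on 1.
    print(score.local_score_diff(0, 2))
    print(score.local_score_diff_parents(0, 2, [1]))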